From 70be8dd11720fc2d34b764b642c9dcbacd540508 Mon Sep 17 00:00:00 2001 From: mmlspark-bot Date: Fri, 2 Jun 2017 11:57:25 -0400 Subject: [PATCH] Initial content --- .gitignore | 29 + CONTRIBUTING.md | 38 ++ LICENSE | 22 + README.md | 177 ++++++ docs/developer-readme.md | 80 +++ docs/third-party-notices.txt | 298 ++++++++++ docs/your-first-model.md | 109 ++++ .../101 - Adult Census Income Training.ipynb | 139 +++++ ...on Example with Flight Delay Dataset.ipynb | 161 +++++ .../103 - Before and After MMLSpark.ipynb | 286 +++++++++ ...Amazon Book Reviews - TextFeaturizer.ipynb | 186 ++++++ ...202 - Amazon Book Reviews - Word2Vec.ipynb | 228 +++++++ .../301 - CIFAR10 CNTK CNN Evaluation.ipynb | 264 +++++++++ ...302 - Pipeline Image Transformations.ipynb | 236 ++++++++ notebooks/tests/BasicDFOpsSmokeTest.ipynb | 107 ++++ runme | 50 ++ src/.gitignore | 44 ++ src/.sbtopts | 2 + src/build.sbt | 11 + src/checkpoint-data/build.sbt | 1 + .../src/main/scala/CheckpointData.scala | 71 +++ .../src/test/scala/CheckpointDataSuite.scala | 41 ++ src/cntk-model/build.sbt | 3 + src/cntk-model/src/main/python/CNTKModel.py | 21 + src/cntk-model/src/main/scala/CNTKModel.scala | 230 +++++++ .../src/test/scala/CNTKBindingSuite.scala | 60 ++ .../src/test/scala/CNTKModelSuite.scala | 157 +++++ .../src/test/scala/CNTKTestUtils.scala | 74 +++ src/cntk-train/build.sbt | 3 + src/cntk-train/src/main/python/CNTKLearner.py | 23 + .../src/main/scala/BrainscriptBuilder.scala | 117 ++++ .../src/main/scala/CNTKLearner.scala | 168 ++++++ .../src/main/scala/CommandBuilders.scala | 117 ++++ .../src/main/scala/DataConversion.scala | 173 ++++++ .../src/main/scala/TypeMapping.scala | 41 ++ .../src/test/scala/ValidateCntkTrain.scala | 267 +++++++++ .../test/scala/ValidateConfiguration.scala | 28 + .../test/scala/ValidateDataConversion.scala | 83 +++ .../test/scala/ValidateEnvironmentUtils.scala | 14 + src/codegen/build.sbt | 12 + src/codegen/src/main/scala/CodeGen.scala | 79 +++ src/codegen/src/main/scala/Config.scala | 29 + .../src/main/scala/PySparkWrapper.scala | 345 +++++++++++ .../main/scala/PySparkWrapperGenerator.scala | 123 ++++ .../src/main/scala/PySparkWrapperTest.scala | 275 +++++++++ src/compute-model-statistics/build.sbt | 3 + .../main/scala/ComputeModelStatistics.scala | 559 +++++++++++++++++ .../scala/VerifyComputeModelStatistics.scala | 245 ++++++++ src/compute-per-instance-statistics/build.sbt | 3 + .../scala/ComputePerInstanceStatistics.scala | 110 ++++ .../VerifyComputePerInstanceStatistics.scala | 130 ++++ src/core/build.sbt | 1 + src/core/contracts/build.sbt | 1 + .../contracts/src/main/scala/Exceptions.scala | 35 ++ .../contracts/src/main/scala/Metrics.scala | 47 ++ .../contracts/src/main/scala/Params.scala | 134 +++++ src/core/env/build.sbt | 7 + src/core/env/src/main/scala/CodegenTags.scala | 13 + .../env/src/main/scala/Configuration.scala | 51 ++ .../env/src/main/scala/EnvironmentUtils.scala | 52 ++ .../env/src/main/scala/FileUtilities.scala | 139 +++++ src/core/env/src/main/scala/Logging.scala | 23 + src/core/env/src/main/scala/NativeLoader.java | 194 ++++++ .../env/src/main/scala/ProcessUtilities.scala | 26 + src/core/hadoop/build.sbt | 1 + .../hadoop/src/main/scala/HadoopUtils.scala | 176 ++++++ src/core/ml/build.sbt | 3 + .../ml/src/test/scala/HashingTFSpec.scala | 81 +++ src/core/ml/src/test/scala/IDFSpec.scala | 103 ++++ src/core/ml/src/test/scala/NGramSpec.scala | 74 +++ .../ml/src/test/scala/OneHotEncoderSpec.scala | 102 ++++ src/core/ml/src/test/scala/Word2VecSpec.scala | 93 +++ 
src/core/schema/build.sbt | 4 + .../src/main/python/TypeConversionUtils.py | 17 + src/core/schema/src/main/python/Utils.py | 69 +++ .../src/main/scala/BinaryFileSchema.scala | 32 + .../schema/src/main/scala/Categoricals.scala | 317 ++++++++++ .../src/main/scala/DatasetExtensions.scala | 68 +++ .../schema/src/main/scala/ImageSchema.scala | 46 ++ .../src/main/scala/SchemaConstants.scala | 44 ++ .../schema/src/main/scala/SparkSchema.scala | 352 +++++++++++ .../src/test/scala/TestCategoricals.scala | 131 ++++ .../scala/VerifyFastVectorAssembler.scala | 118 ++++ .../src/test/scala/VerifySparkSchema.scala | 56 ++ src/core/spark/build.sbt | 1 + .../spark/src/main/scala/ArrayMapParam.scala | 70 +++ .../spark/src/main/scala/EstimatorParam.scala | 36 ++ .../src/main/scala/FastVectorAssembler.scala | 154 +++++ .../spark/src/main/scala/MapArrayParam.scala | 74 +++ .../src/main/scala/MetadataUtilities.scala | 10 + .../spark/src/main/scala/TransformParam.scala | 58 ++ src/core/test/base/build.sbt | 1 + .../src/main/scala/SparkSessionFactory.scala | 53 ++ .../test/base/src/main/scala/TestBase.scala | 155 +++++ src/core/test/build.sbt | 1 + src/core/test/datagen/build.sbt | 1 + .../src/main/scala/DatasetConstraints.scala | 68 +++ .../src/main/scala/DatasetOptions.scala | 57 ++ .../src/main/scala/GenerateDataType.scala | 37 ++ .../src/main/scala/GenerateDataset.scala | 114 ++++ .../datagen/src/main/scala/GenerateRow.scala | 70 +++ .../src/main/scala/ModuleFuzzingTest.scala | 52 ++ .../test/scala/VerifyGenerateDataset.scala | 46 ++ src/data-conversion/build.sbt | 1 + .../src/main/scala/DataConversion.scala | 161 +++++ .../src/test/scala/VerifyDataConversion.scala | 232 ++++++++ src/downloader/build.sbt | 1 + .../src/main/python/ModelDownloader.py | 101 ++++ .../src/main/scala/ModelDownloader.scala | 260 ++++++++ src/downloader/src/main/scala/Schema.scala | 92 +++ .../src/test/scala/DownloaderSuite.scala | 49 ++ src/featurize/build.sbt | 3 + .../src/main/scala/AssembleFeatures.scala | 499 ++++++++++++++++ src/featurize/src/main/scala/Featurize.scala | 92 +++ .../src/test/scala/VerifyFeaturize.scala | 330 +++++++++++ .../test/scala/benchmarkBasicDataTypes.json | 12 + .../src/test/scala/benchmarkNoOneHot.json | 6 + .../src/test/scala/benchmarkOneHot.json | 6 + .../src/test/scala/benchmarkString.json | 5 + .../scala/benchmarkStringIndexOneHot.json | 6 + .../test/scala/benchmarkStringMissing.json | 5 + .../src/test/scala/benchmarkVectors.json | 7 + src/find-best-model/build.sbt | 3 + .../src/main/scala/FindBestModel.scala | 331 +++++++++++ .../src/test/scala/VerifyFindBestModel.scala | 106 ++++ src/fuzzing/build.sbt | 5 + src/fuzzing/src/test/scala/Fuzzing.scala | 254 ++++++++ src/image-featurizer/build.sbt | 5 + .../src/main/scala/ImageFeaturizer.scala | 128 ++++ .../src/test/scala/ImageFeaturizerSuite.scala | 66 +++ src/image-transformer/build.sbt | 2 + .../src/main/python/ImageTransform.py | 96 +++ .../src/main/scala/ImageTransformer.scala | 314 ++++++++++ .../src/main/scala/UnrollImage.scala | 70 +++ .../test/scala/ImageTransformerSuite.scala | 293 +++++++++ src/multi-column-adapter/build.sbt | 1 + .../src/main/scala/MultiColumnAdapter.scala | 121 ++++ .../test/scala/MultiColumnAdapterSpec.scala | 49 ++ src/partition-sample/build.sbt | 1 + .../src/main/scala/PartitionSample.scala | 117 ++++ .../test/scala/VerifyPartitionSample.scala | 67 +++ src/pipeline-stages/build.sbt | 1 + .../src/main/scala/Repartition.scala | 42 ++ .../src/main/scala/SelectColumns.scala | 63 ++ .../src/test/scala/RepartitionSuite.scala 
| 50 ++ .../src/test/scala/SelectColumnsSuite.scala | 75 +++ src/project/build.sbt | 16 + src/project/build.scala | 201 +++++++ src/project/lib-check.scala | 34 ++ src/project/meta.sbt | 108 ++++ src/project/plugins.sbt | 5 + src/project/scalastyle.scala | 136 +++++ src/readers/build.sbt | 1 + .../src/main/python/BinaryFileReader.py | 52 ++ src/readers/src/main/python/ImageReader.py | 50 ++ .../src/main/scala/AzureBlobReader.scala | 72 +++ .../src/main/scala/AzureSQLReader.scala | 53 ++ .../src/main/scala/BinaryFileReader.scala | 79 +++ src/readers/src/main/scala/FileFormat.scala | 12 + src/readers/src/main/scala/ImageReader.scala | 63 ++ src/readers/src/main/scala/ReaderUtils.scala | 47 ++ src/readers/src/main/scala/Readers.scala | 50 ++ src/readers/src/main/scala/WasbReader.scala | 47 ++ .../test/scala/BinaryFileReaderSuite.scala | 44 ++ .../src/test/scala/ImageReaderSuite.scala | 75 +++ src/summarize-data/build.sbt | 1 + .../src/main/scala/SummarizeData.scala | 189 ++++++ .../src/test/scala/SummarizeDataSuite.scala | 52 ++ src/text-featurizer/build.sbt | 2 + .../src/main/scala/TextFeaturizer.scala | 442 ++++++++++++++ .../src/test/scala/TextFeaturizerSpec.scala | 86 +++ src/train-classifier/build.sbt | 3 + .../src/main/scala/TrainClassifier.scala | 367 ++++++++++++ .../test/scala/VerifyTrainClassifier.scala | 560 ++++++++++++++++++ .../src/test/scala/benchmarkMetrics.csv | 68 +++ src/train-regressor/build.sbt | 2 + .../src/main/scala/TrainRegressor.scala | 246 ++++++++ .../src/test/scala/VerifyTrainRegressor.scala | 184 ++++++ src/utils/build.sbt | 1 + .../src/main/scala/JarLoadingUtils.scala | 139 +++++ .../src/main/scala/ObjectUtilities.scala | 71 +++ .../src/main/scala/PipelineUtilities.scala | 55 ++ tools/bin/mml-exec | 37 ++ tools/build-pr/checkout | 58 ++ tools/build-pr/report | 39 ++ tools/build-pr/shared.sh | 47 ++ tools/config.sh | 274 +++++++++ tools/docker/Dockerfile | 54 ++ tools/docker/bin/EULA.txt | 203 +++++++ tools/docker/bin/eula | 13 + tools/docker/bin/eula.html | 54 ++ tools/docker/bin/eula.py | 37 ++ tools/docker/bin/launcher | 24 + tools/docker/build-docker | 49 ++ tools/docker/build-env | 28 + tools/hdi/install-mmlspark.sh | 165 ++++++ tools/hdi/setup-test-authkey.sh | 34 ++ tools/hdi/update_livy.py | 25 + tools/mmlspark-packages.spec | 67 +++ tools/notebook/postprocess.py | 110 ++++ tools/notebook/tester/NotebookTestSuite.py | 69 +++ tools/notebook/tester/TestNotebooksLocally.py | 36 ++ tools/notebook/tester/TestNotebooksOnHdi.py | 48 ++ tools/notebook/tester/parallel_run.sh | 32 + tools/pip/MANIFEST.in | 5 + tools/pip/README.txt | 8 + tools/pip/generate-pip.sh | 29 + tools/pip/setup.py | 33 ++ tools/pytests/auto-tests | 19 + tools/pytests/notebook-tests | 11 + tools/pytests/shared.sh | 16 + tools/runme/README.txt | 4 + tools/runme/build-readme.tmpl | 12 + tools/runme/build.sh | 249 ++++++++ tools/runme/install.sh | 206 +++++++ tools/runme/runme.sh | 51 ++ tools/runme/show-version | 7 + tools/runme/utils.sh | 450 ++++++++++++++ tools/tests/tags.sh | 70 +++ 219 files changed, 20154 insertions(+) create mode 100644 .gitignore create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 docs/developer-readme.md create mode 100644 docs/third-party-notices.txt create mode 100644 docs/your-first-model.md create mode 100644 notebooks/samples/101 - Adult Census Income Training.ipynb create mode 100644 notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb create mode 100644 notebooks/samples/103 - Before 
and After MMLSpark.ipynb create mode 100644 notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb create mode 100644 notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb create mode 100644 notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb create mode 100644 notebooks/samples/302 - Pipeline Image Transformations.ipynb create mode 100644 notebooks/tests/BasicDFOpsSmokeTest.ipynb create mode 100755 runme create mode 100644 src/.gitignore create mode 100644 src/.sbtopts create mode 100644 src/build.sbt create mode 100644 src/checkpoint-data/build.sbt create mode 100644 src/checkpoint-data/src/main/scala/CheckpointData.scala create mode 100644 src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala create mode 100644 src/cntk-model/build.sbt create mode 100644 src/cntk-model/src/main/python/CNTKModel.py create mode 100644 src/cntk-model/src/main/scala/CNTKModel.scala create mode 100644 src/cntk-model/src/test/scala/CNTKBindingSuite.scala create mode 100644 src/cntk-model/src/test/scala/CNTKModelSuite.scala create mode 100644 src/cntk-model/src/test/scala/CNTKTestUtils.scala create mode 100644 src/cntk-train/build.sbt create mode 100644 src/cntk-train/src/main/python/CNTKLearner.py create mode 100644 src/cntk-train/src/main/scala/BrainscriptBuilder.scala create mode 100644 src/cntk-train/src/main/scala/CNTKLearner.scala create mode 100644 src/cntk-train/src/main/scala/CommandBuilders.scala create mode 100644 src/cntk-train/src/main/scala/DataConversion.scala create mode 100644 src/cntk-train/src/main/scala/TypeMapping.scala create mode 100644 src/cntk-train/src/test/scala/ValidateCntkTrain.scala create mode 100644 src/cntk-train/src/test/scala/ValidateConfiguration.scala create mode 100644 src/cntk-train/src/test/scala/ValidateDataConversion.scala create mode 100644 src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala create mode 100644 src/codegen/build.sbt create mode 100644 src/codegen/src/main/scala/CodeGen.scala create mode 100644 src/codegen/src/main/scala/Config.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapper.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapperGenerator.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapperTest.scala create mode 100644 src/compute-model-statistics/build.sbt create mode 100644 src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala create mode 100644 src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala create mode 100644 src/compute-per-instance-statistics/build.sbt create mode 100644 src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala create mode 100644 src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala create mode 100644 src/core/build.sbt create mode 100644 src/core/contracts/build.sbt create mode 100644 src/core/contracts/src/main/scala/Exceptions.scala create mode 100644 src/core/contracts/src/main/scala/Metrics.scala create mode 100644 src/core/contracts/src/main/scala/Params.scala create mode 100644 src/core/env/build.sbt create mode 100644 src/core/env/src/main/scala/CodegenTags.scala create mode 100644 src/core/env/src/main/scala/Configuration.scala create mode 100644 src/core/env/src/main/scala/EnvironmentUtils.scala create mode 100644 src/core/env/src/main/scala/FileUtilities.scala create mode 100644 src/core/env/src/main/scala/Logging.scala create mode 100644 src/core/env/src/main/scala/NativeLoader.java create mode 100644 
src/core/env/src/main/scala/ProcessUtilities.scala create mode 100644 src/core/hadoop/build.sbt create mode 100644 src/core/hadoop/src/main/scala/HadoopUtils.scala create mode 100644 src/core/ml/build.sbt create mode 100644 src/core/ml/src/test/scala/HashingTFSpec.scala create mode 100644 src/core/ml/src/test/scala/IDFSpec.scala create mode 100644 src/core/ml/src/test/scala/NGramSpec.scala create mode 100644 src/core/ml/src/test/scala/OneHotEncoderSpec.scala create mode 100644 src/core/ml/src/test/scala/Word2VecSpec.scala create mode 100644 src/core/schema/build.sbt create mode 100644 src/core/schema/src/main/python/TypeConversionUtils.py create mode 100644 src/core/schema/src/main/python/Utils.py create mode 100644 src/core/schema/src/main/scala/BinaryFileSchema.scala create mode 100644 src/core/schema/src/main/scala/Categoricals.scala create mode 100644 src/core/schema/src/main/scala/DatasetExtensions.scala create mode 100644 src/core/schema/src/main/scala/ImageSchema.scala create mode 100644 src/core/schema/src/main/scala/SchemaConstants.scala create mode 100644 src/core/schema/src/main/scala/SparkSchema.scala create mode 100644 src/core/schema/src/test/scala/TestCategoricals.scala create mode 100644 src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala create mode 100644 src/core/schema/src/test/scala/VerifySparkSchema.scala create mode 100644 src/core/spark/build.sbt create mode 100644 src/core/spark/src/main/scala/ArrayMapParam.scala create mode 100644 src/core/spark/src/main/scala/EstimatorParam.scala create mode 100644 src/core/spark/src/main/scala/FastVectorAssembler.scala create mode 100644 src/core/spark/src/main/scala/MapArrayParam.scala create mode 100644 src/core/spark/src/main/scala/MetadataUtilities.scala create mode 100644 src/core/spark/src/main/scala/TransformParam.scala create mode 100644 src/core/test/base/build.sbt create mode 100644 src/core/test/base/src/main/scala/SparkSessionFactory.scala create mode 100644 src/core/test/base/src/main/scala/TestBase.scala create mode 100644 src/core/test/build.sbt create mode 100644 src/core/test/datagen/build.sbt create mode 100644 src/core/test/datagen/src/main/scala/DatasetConstraints.scala create mode 100644 src/core/test/datagen/src/main/scala/DatasetOptions.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateDataType.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateDataset.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateRow.scala create mode 100644 src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala create mode 100644 src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala create mode 100644 src/data-conversion/build.sbt create mode 100644 src/data-conversion/src/main/scala/DataConversion.scala create mode 100644 src/data-conversion/src/test/scala/VerifyDataConversion.scala create mode 100644 src/downloader/build.sbt create mode 100644 src/downloader/src/main/python/ModelDownloader.py create mode 100644 src/downloader/src/main/scala/ModelDownloader.scala create mode 100644 src/downloader/src/main/scala/Schema.scala create mode 100644 src/downloader/src/test/scala/DownloaderSuite.scala create mode 100644 src/featurize/build.sbt create mode 100644 src/featurize/src/main/scala/AssembleFeatures.scala create mode 100644 src/featurize/src/main/scala/Featurize.scala create mode 100644 src/featurize/src/test/scala/VerifyFeaturize.scala create mode 100644 src/featurize/src/test/scala/benchmarkBasicDataTypes.json create mode 100644 
src/featurize/src/test/scala/benchmarkNoOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkString.json create mode 100644 src/featurize/src/test/scala/benchmarkStringIndexOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkStringMissing.json create mode 100644 src/featurize/src/test/scala/benchmarkVectors.json create mode 100644 src/find-best-model/build.sbt create mode 100644 src/find-best-model/src/main/scala/FindBestModel.scala create mode 100644 src/find-best-model/src/test/scala/VerifyFindBestModel.scala create mode 100644 src/fuzzing/build.sbt create mode 100644 src/fuzzing/src/test/scala/Fuzzing.scala create mode 100644 src/image-featurizer/build.sbt create mode 100644 src/image-featurizer/src/main/scala/ImageFeaturizer.scala create mode 100644 src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala create mode 100644 src/image-transformer/build.sbt create mode 100644 src/image-transformer/src/main/python/ImageTransform.py create mode 100644 src/image-transformer/src/main/scala/ImageTransformer.scala create mode 100644 src/image-transformer/src/main/scala/UnrollImage.scala create mode 100644 src/image-transformer/src/test/scala/ImageTransformerSuite.scala create mode 100644 src/multi-column-adapter/build.sbt create mode 100644 src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala create mode 100644 src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala create mode 100644 src/partition-sample/build.sbt create mode 100644 src/partition-sample/src/main/scala/PartitionSample.scala create mode 100644 src/partition-sample/src/test/scala/VerifyPartitionSample.scala create mode 100644 src/pipeline-stages/build.sbt create mode 100644 src/pipeline-stages/src/main/scala/Repartition.scala create mode 100644 src/pipeline-stages/src/main/scala/SelectColumns.scala create mode 100644 src/pipeline-stages/src/test/scala/RepartitionSuite.scala create mode 100644 src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala create mode 100644 src/project/build.sbt create mode 100644 src/project/build.scala create mode 100644 src/project/lib-check.scala create mode 100644 src/project/meta.sbt create mode 100644 src/project/plugins.sbt create mode 100644 src/project/scalastyle.scala create mode 100644 src/readers/build.sbt create mode 100644 src/readers/src/main/python/BinaryFileReader.py create mode 100644 src/readers/src/main/python/ImageReader.py create mode 100644 src/readers/src/main/scala/AzureBlobReader.scala create mode 100644 src/readers/src/main/scala/AzureSQLReader.scala create mode 100644 src/readers/src/main/scala/BinaryFileReader.scala create mode 100644 src/readers/src/main/scala/FileFormat.scala create mode 100644 src/readers/src/main/scala/ImageReader.scala create mode 100644 src/readers/src/main/scala/ReaderUtils.scala create mode 100644 src/readers/src/main/scala/Readers.scala create mode 100644 src/readers/src/main/scala/WasbReader.scala create mode 100644 src/readers/src/test/scala/BinaryFileReaderSuite.scala create mode 100644 src/readers/src/test/scala/ImageReaderSuite.scala create mode 100644 src/summarize-data/build.sbt create mode 100644 src/summarize-data/src/main/scala/SummarizeData.scala create mode 100644 src/summarize-data/src/test/scala/SummarizeDataSuite.scala create mode 100644 src/text-featurizer/build.sbt create mode 100644 src/text-featurizer/src/main/scala/TextFeaturizer.scala create mode 100644 
src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala create mode 100644 src/train-classifier/build.sbt create mode 100644 src/train-classifier/src/main/scala/TrainClassifier.scala create mode 100644 src/train-classifier/src/test/scala/VerifyTrainClassifier.scala create mode 100644 src/train-classifier/src/test/scala/benchmarkMetrics.csv create mode 100644 src/train-regressor/build.sbt create mode 100644 src/train-regressor/src/main/scala/TrainRegressor.scala create mode 100644 src/train-regressor/src/test/scala/VerifyTrainRegressor.scala create mode 100644 src/utils/build.sbt create mode 100644 src/utils/src/main/scala/JarLoadingUtils.scala create mode 100644 src/utils/src/main/scala/ObjectUtilities.scala create mode 100644 src/utils/src/main/scala/PipelineUtilities.scala create mode 100755 tools/bin/mml-exec create mode 100755 tools/build-pr/checkout create mode 100755 tools/build-pr/report create mode 100644 tools/build-pr/shared.sh create mode 100644 tools/config.sh create mode 100644 tools/docker/Dockerfile create mode 100644 tools/docker/bin/EULA.txt create mode 100755 tools/docker/bin/eula create mode 100644 tools/docker/bin/eula.html create mode 100755 tools/docker/bin/eula.py create mode 100755 tools/docker/bin/launcher create mode 100755 tools/docker/build-docker create mode 100755 tools/docker/build-env create mode 100755 tools/hdi/install-mmlspark.sh create mode 100755 tools/hdi/setup-test-authkey.sh create mode 100755 tools/hdi/update_livy.py create mode 100644 tools/mmlspark-packages.spec create mode 100755 tools/notebook/postprocess.py create mode 100644 tools/notebook/tester/NotebookTestSuite.py create mode 100644 tools/notebook/tester/TestNotebooksLocally.py create mode 100644 tools/notebook/tester/TestNotebooksOnHdi.py create mode 100755 tools/notebook/tester/parallel_run.sh create mode 100644 tools/pip/MANIFEST.in create mode 100644 tools/pip/README.txt create mode 100755 tools/pip/generate-pip.sh create mode 100644 tools/pip/setup.py create mode 100755 tools/pytests/auto-tests create mode 100755 tools/pytests/notebook-tests create mode 100644 tools/pytests/shared.sh create mode 100644 tools/runme/README.txt create mode 100644 tools/runme/build-readme.tmpl create mode 100644 tools/runme/build.sh create mode 100644 tools/runme/install.sh create mode 100755 tools/runme/runme.sh create mode 100755 tools/runme/show-version create mode 100644 tools/runme/utils.sh create mode 100755 tools/tests/tags.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..01eb12d9e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# include BuildArtifacts.zip which is used in some parts of the build +/BuildArtifacts* +/TestResults +# accommodate installing the build environment locally +/pkgs/ +# useful env configurations +/tools/local-config.sh + +# Generated by tools/build-pr +/.build-pr + +# Ignore these for safety +*.class +*.jar +*.log +*.tgz +*.zip +*.exe +*.pyc +*.pyo + +# Generic editors +.vscode + +# Common things +*~ +.#* +.*.swp +.DS_Store diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..a0694bcc4a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +## Interested in contributing to MMLSpark? We're excited to work with you. + +### You can contribute in many ways + +* Use the library and give feedback +* Report a bug +* Request a feature +* Fix a bug +* Add examples and documentation +* Code a new feature +* Review pull requests + +### How to contribute? 
+ +You can give feedback, report bugs, and request new features anytime by +opening an issue. Also, you can up-vote and comment on existing issues. + +To make a pull request into the repo, such as bug fixes, documentation, +or new features, follow these steps: + +* If it's a new feature, open an issue for preliminary discussion with + us, to ensure your contribution is a good fit and doesn't duplicate + ongoing work. +* Typically, you'll need to accept the Microsoft Contributor License + Agreement (CLA). +* Familiarize yourself with the coding style and guidelines. +* Fork the repository, code your contribution, and create a pull + request. +* Wait for an MMLSpark team member to review and accept it. Be patient + as we iron out the process for a new project. + +A good way to get started contributing is to look for issues with a "help +wanted" label. These are issues that we do want to fix, but don't have +the resources to work on currently. + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..e2704e7bac --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) Microsoft Corporation. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..feeaff1203 --- /dev/null +++ b/README.md @@ -0,0 +1,177 @@ +# Microsoft Machine Learning for Apache Spark + + + +MMLSpark provides a number of deep learning and data science tools for [Apache +Spark](https://github.com/apache/spark), including seamless integration of Spark +Machine Learning pipelines with [Microsoft Cognitive Toolkit +(CNTK)](https://github.com/Microsoft/CNTK) and [OpenCV](http://www.opencv.org/), +enabling you to quickly create powerful, highly-scalable predictive and +analytical models for large image and text datasets. + +MMLSpark requires Scala 2.11, Spark 2.1+, and either Python 2.7 or +Python 3.5+. See the API documentation +[for Scala](http://mmlspark.azureedge.net/docs/scala/) and +[for PySpark](http://mmlspark.azureedge.net/docs/pyspark/).
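+ +As a quick sanity check after installation, the following minimal sketch can be run from a PySpark shell or notebook; it assumes the package has been installed as described in the setup section below and that the interactive `spark` session PySpark provides is available: + + ```python + import sys + print(sys.version_info) # expect Python 2.7 or 3.5+ + print(spark.version) # expect Spark 2.1 or later + import mmlspark # succeeds once the MMLSpark package is installed + ```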
+ + +## Salient features + +* Easily ingest images from HDFS into a Spark `DataFrame` ([example:301]) +* Pre-process image data using transforms from OpenCV ([example:302]) +* Featurize images with pre-trained deep neural nets using CNTK ([example:301]) +* Train DNN-based image classification models on N-Series GPU VMs on Azure + ([example:301]) +* Featurize free-form text data using convenient APIs on top of primitives in + SparkML via a single transformer ([example:201]) +* Train classification and regression models easily via implicit featurization + of data ([example:101]) +* Compute a rich set of evaluation metrics including per-instance metrics + ([example:102]) + +See our [notebooks](notebooks/samples/) for all examples. + +[example:101]: notebooks/samples/101%20-%20Adult%20Census%20Income%20Training.ipynb + "Adult Census Income Training" +[example:102]: notebooks/samples/102%20-%20Regression%20Example%20with%20Flight%20Delay%20Dataset.ipynb + "Regression Example with Flight Delay Dataset" +[example:201]: notebooks/samples/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb + "Amazon Book Reviews - TextFeaturizer" +[example:301]: notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb + "CIFAR10 CNTK CNN Evaluation" +[example:302]: notebooks/samples/302%20-%20Pipeline%20Image%20Transformations.ipynb + "Pipeline Image Transformations" + + +## A short example + +Below is an excerpt from a simple example of using a pre-trained CNN to classify +images in the CIFAR-10 dataset. View the whole source code as [an example +notebook](notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb). + + ```python + ... + import mmlspark as mml + # Initialize CNTKModel and define input and output columns + cntkModel = mml.CNTKModel().setInputCol("images").setOutputCol("output").setModelLocation(modelFile) + # Score the dataset with the model via an internal Spark pipeline + scoredImages = cntkModel.transform(imagesWithLabels) + ... + ``` + +See [other sample notebooks](notebooks/samples/) as well as the MMLSpark +documentation for [Scala](http://mmlspark.azureedge.net/docs/scala/) +and [PySpark](http://mmlspark.azureedge.net/docs/pyspark/). + + +## Setup and installation + +### Docker + +The easiest way to evaluate MMLSpark is via our pre-built Docker container. To +do so, run the following command: + + docker run -it -p 8888:8888 microsoft/mmlspark + +Navigate to <http://localhost:8888> in your web browser to run the sample +notebooks. See the +[documentation](http://mmlspark.azureedge.net/docs/pyspark/install.html) +for more on Docker use. + +> Note: If you wish to run a new instance of the Docker image, make sure you +> stop & remove the container with the name `my-mml` (using `docker rm my-mml`) +> before you try to run a new instance, or run it with a `--rm` flag. + +### Spark package + +MMLSpark can be conveniently installed on existing Spark clusters via the +`--packages` option, for example: + + spark-shell --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven + + pyspark --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven + + spark-submit --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven \ + MyApp.jar + + + +### HDInsight + +To install MMLSpark on an existing [HDInsight Spark +Cluster](https://docs.microsoft.com/en-us/azure/hdinsight/), you can execute a +script action on the cluster head and worker nodes.
For instructions on running +script actions, see [this +guide](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux#use-a-script-action-during-cluster-creation). + +The script action URL is: + . + +If you're using the Azure Portal to run the script action, go to `Script +actions` ⇒ `Submit new` in the `Overview` section of your cluster blade. In the +`Bash script URI` field, input the script action URL provided above. Mark the +rest of the options as shown on the screenshot to the right. + +Submit, and the cluster should finish configuring within 10 minutes or so. + +### Databricks cloud + +To install MMLSpark on the +[Databricks cloud](http://community.cloud.databricks.com), create a new +[library from Maven coordinates](https://docs.databricks.com/user-guide/libraries.html#libraries-from-maven-pypi-or-spark-packages) +in your workspace. + +For the coordinates use: `com.microsoft.ml.spark:mmlspark:0.5`. Then, under +Advanced Options, use `https://mmlspark.azureedge.net/maven` for the repository. +Ensure this library is attached to all clusters you create. + +Finally, ensure that your Spark cluster has at least Spark 2.1 and Scala 2.11. + +You can use MMLSpark in both your Scala and PySpark notebooks. + +### SBT + +If you are building a Spark application in Scala, add the following lines to +your `build.sbt`: + + ```scala + resolvers += "MMLSpark Repo" at "https://mmlspark.azureedge.net/maven" + libraryDependencies += "com.microsoft.ml.spark" %% "mmlspark" % "0.5" + ``` + +### Building from source + +You can also easily create your own build by cloning this repo and using the main +build script: `./runme`. Run it once to install the needed dependencies, and +again to do a build. See [this guide](docs/developer-readme.md) for more +information. + + +## Contributing & feedback + +This project has adopted the [Microsoft Open Source Code of +Conduct](https://opensource.microsoft.com/codeofconduct/). For more information +see the [Code of Conduct +FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact +[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional +questions or comments. + +See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines. + +To give feedback and/or report an issue, open a [GitHub +Issue](https://help.github.com/articles/creating-an-issue/). + + +## Other relevant projects + +* [Microsoft Cognitive Toolkit](https://github.com/Microsoft/CNTK) + +* [Azure Machine Learning + Operationalization](https://github.com/Azure/Machine-Learning-Operationalization) + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/docs/developer-readme.md b/docs/developer-readme.md new file mode 100644 index 0000000000..04d0e97a03 --- /dev/null +++ b/docs/developer-readme.md @@ -0,0 +1,80 @@ +# MMLSpark + +## Repository Layout + +* `runme`: main build entry point +* `src/`: Scala and Python sources + - `core/`: shared functionality + - `project/`: sbt build-related materials +* `tools/`: build-related tools + + +## Build + +### Build Environment + +Currently, this code is developed and built on Linux. The main build entry +point, `./runme`, will install the needed packages. When everything is +installed, you can use `./runme` again to do a build. + + +### Development + +From now on, you can continue using `./runme` for builds. Alternatively, use +`sbt full-build` to do the build directly through SBT.
The output will show +the individual steps that are running, and you can use them directly as usual +with SBT. For example, use `sbt "project foo-bar" test` to run the tests of +the `foo-bar` sub-project, or `sbt ~compile` to do a full compilation step +whenever any file changes. + +Note that the SBT environment is set up in a way that makes *all* code in +`com.microsoft.ml.spark` available in the Scala console that you get when you +run `sbt console`. This can be a very useful debugging tool, since you get to +play with your code in an interactive REPL. + +Every once in a while the installed libraries will be updated. In this case, +executing `./runme` will update the libraries, and the next run will do a build +as usual. If you're using `sbt` directly, it will warn you whenever there was +a change to the library configurations. + +Note: the libraries are all installed in `$HOME/lib` with a few +executable symlinks in `$HOME/bin`. The environment is configured in +`$HOME/.mmlspark_profile` which will be executed whenever a shell starts. +Occasionally, `./runme` will tell you that there was an update to the +`.mmlspark_profile` file --- when this happens, you can start a new shell +to get the updated version, but you can also apply the changes to your +running shell with `. ~/.mmlspark_profile` which will evaluate its +contents and save a shell restart. + + +## Adding a Module + +To add a new module, create a directory with an appropriate name, and in the +new directory create a `build.sbt` file. The contents of `build.sbt` are +optional, and the file can be completely empty: its presence will make the build +include your directory as a sub-project in SBT work. + +You can put the usual SBT customizations in your `build.sbt`, for example: + + version := "1.0" + name := "A Useful Module" + +In addition, there are a few utilities in `Extras` that can be useful to +specify some things. Currently, there is only one such utility: + + Extras.noJar + +putting this in your `build.sbt` indicates that no `.jar` file should be +created for your sub-project in the `package` step. (Useful, for example, for +build tools and test-only directories.) + +Finally, whenever SBT runs it generates an `autogen.sbt` file that specifies +the sub-projects. This file is generated automatically so there is no need to +edit a central file when you add a module, and therefore customizing what +appears in it is done via "meta comments" in your `build.sbt`. This is +currently used to specify dependencies for your sub-project --- in most cases +you will want to add this: + + //> DependsOn: core + +to use the shared code in the `core` sub-project. diff --git a/docs/third-party-notices.txt b/docs/third-party-notices.txt new file mode 100644 index 0000000000..58540ba262 --- /dev/null +++ b/docs/third-party-notices.txt @@ -0,0 +1,298 @@ +================================================================================ +*** OpenCV +================================================================================ + +By downloading, copying, installing or using the software you agree to +this license. If you do not agree to this license, do not download, +install, copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2000-2016, Intel Corporation, all rights reserved. +Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
+Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. +Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. +Copyright (C) 2015-2016, Itseez Inc., all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as +is" and any express or implied warranties, including, but not limited +to, the implied warranties of merchantability and fitness for a +particular purpose are disclaimed. In no event shall copyright holders +or contributors be liable for any direct, indirect, incidental, special, +exemplary, or consequential damages (including, but not limited to, +procurement of substitute goods or services; loss of use, data, or +profits; or business interruption) however caused and on any theory of +liability, whether in contract, strict liability, or tort (including +negligence or otherwise) arising in any way out of the use of this +software, even if advised of the possibility of such damage. + + + +================================================================================ +*** File with code "taken from" PCL library +================================================================================ + +Software License Agreement (BSD License) + +Point Cloud Library (PCL) - www.pointclouds.org +Copyright (c) 2009-2012, Willow Garage, Inc. +Copyright (c) 2012-, Open Perception, Inc. +Copyright (c) XXX, respective authors. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +================================================================================ +*** KAZE +================================================================================ + +Copyright (c) 2012, Pablo Fernández Alcantarilla +All Rights Reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +================================================================================ +*** libwebp +================================================================================ + +Copyright (c) 2010, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Google nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Additional IP Rights Grant (Patents) +------------------------------------ + +"These implementations" means the copyrightable works that implement the +WebM codecs distributed by Google as part of the WebM Project. + +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitute direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed." + + + +================================================================================ +*** File with code "based on" a message of Laurent Pinchart on the +*** video4linux mailing list +================================================================================ + +LEGAL ISSUES +============ + +In plain English: + +1. We don't promise that this software works. (But if you find any + bugs, please let us know!) +2. You can use this software for whatever you want. You don't have to + pay us. +3. You may not pretend that you wrote this software. If you use it in a + program, you must acknowledge somewhere in your documentation that + you've used the IJG code. + +In legalese: + +The authors make NO WARRANTY or representation, either express or +implied, with respect to this software, its quality, accuracy, +merchantability, or fitness for a particular purpose. This software is +provided "AS IS", and you, its user, assume the entire risk as to its +quality and accuracy. + +This software is copyright (C) 1991-2013, Thomas G. Lane, Guido +Vollbeding. All Rights Reserved except as specified below. + +Permission is hereby granted to use, copy, modify, and distribute this +software (or portions thereof) for any purpose, without fee, subject to +these conditions: +(1) If any part of the source code for this software is distributed, + then this README file must be included, with this copyright and + no-warranty notice unaltered; and any additions, deletions, or + changes to the original files must be clearly indicated in + accompanying documentation. 
+(2) If only executable code is distributed, then the accompanying + documentation must state that "this software is based in part on the + work of the Independent JPEG Group". +(3) Permission for use of this software is granted only if the user + accepts full responsibility for any undesirable consequences; the + authors accept NO LIABILITY for damages of any kind. + +These conditions apply to any software derived from or based on the IJG +code, not just to the unmodified library. If you use our work, you +ought to acknowledge us. + +Permission is NOT granted for the use of any IJG author's name or +company name in advertising or publicity relating to this software or +products derived from it. This software may be referred to only as "the +Independent JPEG Group's software". + +We specifically permit and encourage the use of this software as the +basis of commercial products, provided that all warranty or liability +claims are assumed by the product vendor. + +The Unix configuration script "configure" was produced with GNU +Autoconf. It is copyright by the Free Software Foundation but is freely +distributable. The same holds for its supporting scripts (config.guess, +config.sub, ltmain.sh). Another support script, install-sh, is +copyright by X Consortium but is also freely distributable. + +The IJG distribution formerly included code to read and write GIF files. +To avoid entanglement with the Unisys LZW patent, GIF reading support +has been removed altogether, and the GIF writer has been simplified to +produce "uncompressed GIFs". This technique does not use the LZW +algorithm; the resulting GIF files are larger than usual, but are +readable by all standard GIF decoders. + +We are required to state that + "The Graphics Interchange Format(c) is the Copyright property of + CompuServe Incorporated. GIF(sm) is a Service Mark property of + CompuServe Incorporated." + + + +================================================================================ +*** File with code copyright Yossi Rubner, as well as code copyright +*** MD-Mathematische Dienste GmbH +================================================================================ + + Copyright (c) 2002, + MD-Mathematische Dienste GmbH + Im Defdahl 5-10 + 44141 Dortmund + Germany + www.md-it.de + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. Redistributions +in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. The name of Contributor +may not be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. diff --git a/docs/your-first-model.md b/docs/your-first-model.md new file mode 100644 index 0000000000..6bafdb1cc0 --- /dev/null +++ b/docs/your-first-model.md @@ -0,0 +1,109 @@ +## Your First Model + +In this example, we construct a basic classification model to predict a person's +income level given demographics data such as education level or marital status. +We also learn how to use Jupyter notebooks for developing and running the model. + + +### Prerequisites + +* You have installed the MMLSpark package, either as a Docker image or on a + Spark cluster, +* You have basic knowledge of the Python language, +* You have a basic understanding of machine learning concepts: training, testing, + classification. + + +### Working with Jupyter Notebooks + +Once you have the MMLSpark package installed, open the Jupyter notebooks folder in +your web browser: + +* Local Docker: `http://localhost:8888` +* Spark cluster: `https:///jupyter` + +Create a new notebook by selecting "New" -> "PySpark3". Let's also give the +notebook a friendlier name, *Adult Census Income Prediction*, by clicking the +title. + + +### Importing Packages and Starting the Spark Application + +At this point, the notebook is not yet running a Spark application. In the +first cell, let's import some needed packages: + + import numpy as np + import pandas as pd + +Click the "run cell" button on the toolbar to start the application. After a +few moments, you should see the message "SparkSession available as 'spark'". +Now you're ready to start coding and running your application. + + +### Reading in Data + +In a typical Spark application, you'd likely work with huge datasets stored on a +distributed file system, such as HDFS. However, to keep this tutorial simple +and quick, we'll copy over a small dataset from a URL. We then read this data +into memory using the Pandas CSV reader, and distribute the data as a Spark +DataFrame. Finally, we show the first 5 rows of the dataset. Copy the following +code to the next cell in your notebook, and run the cell. + + dataFile = "AdultCensusIncome.csv" + import os, urllib.request + if not os.path.isfile(dataFile): + urllib.request.urlretrieve("https://mmlspark.azureedge.net/datasets/" + dataFile, dataFile) + data = spark.createDataFrame(pd.read_csv(dataFile, dtype={" hours-per-week": np.float64})) + data.show(5) + + +### Selecting Features and Splitting Data into Train and Test Sets + +Next, select some features to use in our model. You can try out different +features, but you should include `" income"` as it is the label column the model +is trying to predict. We then split the data into `train` and `test` sets. + + data = data.select([" education", " marital-status", " hours-per-week", " income"]) + train, test = data.randomSplit([0.75, 0.25], seed=123) + + +### Training a Model + +To train the classifier model, we use the `mmlspark.TrainClassifier` class. It +takes in training data and a base SparkML classifier, maps the data into the +format expected by the base classifier algorithm, and fits a model.
+ + from mmlspark.TrainClassifier import TrainClassifier + from pyspark.ml.classification import LogisticRegression + model = TrainClassifier(model=LogisticRegression(), labelCol=" income").fit(train) + +Note that `TrainClassifier` implicitly handles string-valued columns and +binarizes the label column. + + +### Scoring and Evaluating the Model + +Finally, let's score the model against the test set, and use the +`mmlspark.ComputeModelStatistics` class to compute metrics — accuracy, AUC, +precision, recall — from the scored data. + + from mmlspark.ComputeModelStatistics import ComputeModelStatistics + prediction = model.transform(test) + metrics = ComputeModelStatistics().transform(prediction) + metrics.select('accuracy').show() + +And that's it: you've built your first machine learning model using the MMLSpark +package. For help on mmlspark classes and methods, you can use Python's help() +function, for example: + + help(mmlspark.TrainClassifier) + +Next, view our other tutorials to learn how to: +* Tune model parameters to find the best model +* Use SparkML pipelines to build a more complex model +* Use deep neural networks for image classification +* Use text analytics for document classification + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/notebooks/samples/101 - Adult Census Income Training.ipynb b/notebooks/samples/101 - Adult Census Income Training.ipynb new file mode 100644 index 0000000000..58d0239476 --- /dev/null +++ b/notebooks/samples/101 - Adult Census Income Training.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we try to predict incomes from the *Adult Census* dataset.\n", + "\n", + "First, we import the packages (use `help(mmlspark)` to view contents):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import mmlspark\n", + "\n", + "# help(mmlspark)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's read the data and split it into train and test sets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"AdultCensusIncome.csv\"\n", + "import os, urllib.request\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, dtype={\" hours-per-week\": np.float64}))\n", + "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n", + "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", + "train.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TrainClassifier` can be used to initialize and fit a model; it wraps SparkML classifiers.\n", + "You can use `help(mmlspark.TrainClassifier)` to view the different parameters.\n", + "\n", + "Note that it implicitly converts the data into the format expected by the algorithm: it tokenizes", + " and hashes strings, one-hot encodes categorical variables, assembles the features into a vector,", + " and so on. The parameter `numFeatures` controls the number of hashed features."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TrainClassifier import TrainClassifier\n", + "from pyspark.ml.classification import LogisticRegression\n", + "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the model is trained, we score it against the test dataset and view metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "prediction = model.transform(test)\n", + "metrics = ComputeModelStatistics().transform(prediction)\n", + "metrics.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we save the model so it can be used in a scoring program." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model.write().overwrite().save(\"AdultCensus.mml\")" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb b/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb new file mode 100644 index 0000000000..b835110085 --- /dev/null +++ b/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we run a linear regression on the *Flight Delay* dataset to predict the delay times.\n", + "\n", + "We demonstrate how to use the `TrainRegressor` and the `ComputePerInstanceStatistics` APIs.\n", + "\n", + "First, import the packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import mmlspark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, import the CSV dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n", + "dataFile = \"On_Time_Performance_2012_9.csv\"\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "flightDelay = spark.createDataFrame(\n", + " pd.read_csv(dataFile, dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n", + " \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n", + " \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n", + " \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n", + "# Print information on the dataset we loaded\n", + "print(\"records read: \" + str(flightDelay.count()))\n", + "print(\"Schema:\")\n", + "flightDelay.printSchema()\n", + "flightDelay.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the dataset into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train,test = flightDelay.randomSplit([0.75, 0.25])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train a regressor on dataset with `l-bfgs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TrainRegressor import TrainRegressor\n", + "from pyspark.ml.regression import LinearRegression\n", + "lr = LinearRegression().setSolver(\"l-bfgs\").setRegParam(0.1).setElasticNetParam(0.3)\n", + "model = TrainRegressor(model=lr, labelCol=\"ArrDelay\", numFeatures=1 << 18).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Score the regressor on the test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "scoredData = model.transform(test)\n", + "scoredData.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, compute and show per-instance statistics, demonstrating the usage", + " of `ComputePerInstanceStatistics`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import ComputePerInstanceStatistics\n", + "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", + "evalPerInstance.select(\"ArrDelay\", \"Scores\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/103 - Before and After MMLSpark.ipynb b/notebooks/samples/103 - Before and After MMLSpark.ipynb new file mode 100644 index 0000000000..0542221ea2 --- /dev/null +++ b/notebooks/samples/103 - Before and After MMLSpark.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Introduction\n", + "\n", + "
", + "\n", + "In this tutorial, we perform the same classification task in two\n", + "diffeerent ways: once using plain **`pyspark`** and once using the\n", + "**`mmlspark`** library. The two methods yield the same performance,\n", + "but one of the two libraries is drastically simpler to use and iterate\n", + "on (can you guess which one?).\n", + "\n", + "The task is simple: Predict whether a user's review of a book sold on\n", + "Amazon is good (rating > 3) or bad based on the text of the review. We\n", + "accomplish this by training LogisticRegression learners with different\n", + "hyperparameters and choosing the best model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Read the data\n", + "\n", + "We download and read in the data. We show a sample below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n", + "\n", + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "raw_data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "raw_data.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Extract more features and process data\n", + "\n", + "Real data however is more complex than the above dataset. It is common\n", + "for a dataset to have features of multiple types: text, numeric,\n", + "categorical. To illustrate how difficult it is to work with these\n", + "datasets, we add two numerical features to the dataset: the **word\n", + "count** of the review and the **mean word length**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import LongType, FloatType, DoubleType\n", + "def word_count(s):\n", + " return len(s.split())\n", + "def word_length(s):\n", + " import numpy as np\n", + " ss = [len(w) for w in s.split()]\n", + " return round(float(np.mean(ss)), 2)\n", + "word_length_udf = udf(word_length, DoubleType())\n", + "word_count_udf = udf(word_count, IntegerType())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = raw_data \\\n", + " .select(\"rating\", \"text\",\n", + " word_count_udf(\"text\").alias(\"wordCount\"),\n", + " word_length_udf(\"text\").alias(\"wordLength\")) \\\n", + " .withColumn(\"label\", raw_data[\"rating\"] > 3).drop(\"rating\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4a. Classify using pyspark\n", + "\n", + "To choose the best LogisticRegression classifier using the `pyspark`\n", + "library, need to *explictly* perform the following steps:\n", + "\n", + "1. 
Process the features:\n", + " * Tokenize the text column\n", + " * Hash the tokenized column into a vector using hashing\n", + " * Merge the numeric features with the vector in the step above\n", + "2. Process the label column: cast it into the proper type.\n", + "3. Train multiple LogisticRegression algorithms on the `train` dataset\n", + " with different hyperparameters\n", + "4. Compute the area under the ROC curve for each of the trained models\n", + " and select the model with the highest metric as computed on the\n", + " `test` dataset\n", + "5. Evaluate the best model on the `validation` set\n", + "\n", + "As you can see below, there is a lot of work involved and a lot of\n", + "steps where something can go wrong!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.feature import Tokenizer, HashingTF\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Featurize text column\n", + "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"tokenizedText\")\n", + "numFeatures = 10000\n", + "hashingScheme = HashingTF(inputCol=\"tokenizedText\",\n", + " outputCol=\"TextFeatures\",\n", + " numFeatures=numFeatures)\n", + "tokenizedData = tokenizer.transform(data)\n", + "featurizedData = hashingScheme.transform(tokenizedData)\n", + "\n", + "# Merge text and numeric features in one feature column\n", + "feature_columns_array = [\"TextFeatures\", \"wordCount\", \"wordLength\"]\n", + "assembler = VectorAssembler(\n", + " inputCols = feature_columns_array,\n", + " outputCol=\"features\")\n", + "assembledData = assembler.transform(featurizedData)\n", + "\n", + "# Select only columns of interest\n", + "# Convert rating column from boolean to int\n", + "processedData = assembledData \\\n", + " .select(\"label\", \"features\") \\\n", + " .withColumn(\"label\", assembledData.label.cast(IntegerType()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.classification import LogisticRegression\n", + "\n", + "# Prepare data for learning\n", + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)\n", + "\n", + "# Train the models on the 'train' data\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",\n", + " metricName=\"areaUnderROC\")\n", + "metrics = []\n", + "models = []\n", + "\n", + "# Select the best model\n", + "for learner in logisticRegressions:\n", + " model = learner.fit(train)\n", + " models.append(model)\n", + " scored_data = model.transform(test)\n", + " metrics.append(evaluator.evaluate(scored_data))\n", + "best_metric = max(metrics)\n", + "best_model = models[metrics.index(best_metric)]\n", + "\n", + "# Save model\n", + "best_model.write().overwrite().save(\"SparkMLExperiment.mmls\")\n", + "# Get AUC on the validation dataset\n", + "scored_val = best_model.transform(validation)\n", + "print(evaluator.evaluate(scored_val))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4b. Classify using mmlspark\n", + "\n", + "Life is a lot simpler when using `mmlspark`!\n", + "\n", + "1. 
The **`TrainClassifier`** Estimator featurizes the data internally,\n", + " as long as the columns selected in the `train`, `test`, `validation`\n", + " dataset represent the features\n", + "\n", + "2. The **`FindBestModel`** Estimator find the best model from a pool of\n", + " trained models by find the model which performs best on the `test`\n", + " dataset given the specified metric\n", + "\n", + "3. The **`CompueModelStatistics`** Transformer computes the different\n", + " metrics on a scored dataset (in our case, the `validation` dataset)\n", + " at the same time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import TrainClassifier, FindBestModel, ComputeModelStatistics\n", + "\n", + "# Prepare data for learning\n", + "train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)\n", + "\n", + "# Train the models on the 'train' data\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n", + " for lrm in logisticRegressions]\n", + "\n", + "# Select the best model\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", + "\n", + "# Save model\n", + "bestModel.write().overwrite().save(\"MMLSExperiment.mmls\")\n", + "# Get AUC on the validation dataset\n", + "predictions = bestModel.transform(validation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "print(metrics.first()[\"AUC\"])" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb b/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb new file mode 100644 index 0000000000..2d1013747a --- /dev/null +++ b/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, try to predict Amazon book ratings greater than 3 out of 5, this time usaging the", + " `TextFeaturizer` module which is a composition of several text analytics APIs that are", + " native to Spark." 
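To make the "composition" concrete, the sketch below shows the kind of plain Spark pipeline that such a text featurizer stands in for: tokenization, stop-word removal, term hashing, and IDF weighting. This is an illustration under assumptions, not the module's actual implementation, and it can only be fitted once the `data` DataFrame is loaded in the cells below.

    # Hand-rolled text featurization with core Spark stages (illustrative only).
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

    manualFeaturizer = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        StopWordsRemover(inputCol="tokens", outputCol="filtered"),
        HashingTF(inputCol="filtered", outputCol="tf", numFeatures=1 << 16),
        IDF(inputCol="tf", outputCol="features", minDocFreq=5)])
    # Later: manualFeaturizer.fit(data).transform(data), once `data` exists.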
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "data.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `TextFeaturizer` to generate our features column. We remove stop words, and use TF-IDF", + " to generate 2²⁰ sparse features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TextFeaturizer import TextFeaturizer\n", + "textFeaturizer = TextFeaturizer() \\\n", + " .setInputCol(\"text\").setOutputCol(\"features\") \\\n", + " .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = textFeaturizer.transform(data)\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Change the label so that we can predict whether the rating is greater than 3 using a binary", + " classifier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3) \\\n", + " .select([\"features\", \"label\"])\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train several Logistic Regression models with different regularizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])\n", + "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]\n", + "\n", + "from mmlspark.TrainClassifier import TrainClassifier\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(train) for lrm in logisticRegressions]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the model with the best AUC on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import FindBestModel\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the optimized `ComputeModelStatistics` API to find the model accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "predictions = bestModel.transform(validation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "metrics.first()[\"accuracy\"]" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb b/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb new file mode 100644 index 0000000000..30d5aa7d64 --- /dev/null +++ b/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yet again, now using the `Word2Vec` Estimator from Spark. We can use the tree-based learners", + " from spark in this scenario due to the lower dimensionality representation of features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "data.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify the label column to predict a rating greater than 3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = data.withColumn(\"label\", data[\"rating\"] > 3) \\\n", + " .select([\"text\", \"label\"])\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the dataset into train, test and validation sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `Tokenizer` and `Word2Vec` to generate the features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import Tokenizer, Word2Vec\n", + "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n", + "partitions = train.rdd.getNumPartitions()\n", + "word2vec = Word2Vec(maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\",\n", + " numPartitions=partitions)\n", + "textFeaturizer = Pipeline(stages = [tokenizer, word2vec]).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform each of the train, test and validation datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ptrain = textFeaturizer.transform(train).select([\"label\", \"features\"])\n", + "ptest = textFeaturizer.transform(test).select([\"label\", \"features\"])\n", + "pvalidation = textFeaturizer.transform(validation).select([\"label\", \"features\"])\n", + "ptrain.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate several models with different parameters from the training data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "from mmlspark.TrainClassifier import TrainClassifier\n", + "import itertools\n", + "\n", + "lrHyperParams = [0.05, 0.2]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n", + " for lrm in logisticRegressions]\n", + "\n", + "rfHyperParams = itertools.product([5, 10], [3, 5])\n", + "randomForests = [RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in rfHyperParams]\n", + "rfmodels = [TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain)\n", + " for rfm in randomForests]\n", + "\n", + "rfHyperParams = itertools.product([8, 16], [3, 5])\n", + "gbtclassifiers = [GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in rfHyperParams]\n", + "gbtmodels = [TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain)\n", + " for gbt in gbtclassifiers]\n", + "\n", + "trainedModels = lrmodels + rfmodels + gbtmodels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the best model for the given test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import FindBestModel\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=trainedModels).fit(ptest)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the accuracy from the validation dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "predictions = bestModel.transform(pvalidation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "metrics.first()[\"accuracy\"]" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb b/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb new file mode 100644 index 0000000000..015e94f2e1 --- /dev/null +++ b/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import CNTKModel, ModelDownloader\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import IntegerType\n", + "from os.path import abspath" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set some paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n", + "\n", + "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n", + "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n", + "dataFile = \"cifar-10-python.tar.gz\"\n", + "dataURL = cdnURL + \"/CIFAR10/\" + dataFile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "modelName = \"ConvNet\"\n", + "modelDir = \"wasb:///models/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "modelName = \"ConvNet\"\n", + "modelDir = \"file:\" + abspath(\"models\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the model and extract the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os, tarfile, pickle\n", + "import urllib.request\n", + "\n", + "d = ModelDownloader(spark, modelDir)\n", + "model = d.downloadByName(modelName)\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(dataURL, dataFile)\n", + "with tarfile.open(dataFile, \"r:gz\") as f:\n", + " test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n", + " encoding=\"latin1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocess the images." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import array\n", + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import *\n", + "\n", + "def reshape_image(record):\n", + " image, label, filename = record\n", + " data = [float(x) for x in image.reshape(3,32,32).flatten()]\n", + " return data, label, filename\n", + "\n", + "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n", + "\n", + "image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n", + "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n", + "\n", + "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n", + "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n", + "imagesWithLabels.printSchema()\n", + "\n", + "imagesWithLabels.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate CNTK model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import time\n", + "start = time.time()\n", + "\n", + "# Use CNTK model to get log probabilities\n", + "cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"output\").setModel(spark, model.uri).setOutputNodeName(\"z\")\n", + "scoredImages = cntkModel.transform(imagesWithLabels)\n", + "\n", + "# Transform the log probabilities to predictions\n", + "def argmax(x): return max(enumerate(x),key=lambda p: p[1])[0]\n", + "\n", + "argmaxUDF = udf(argmax, IntegerType())\n", + "imagePredictions = scoredImages.withColumn(\"predictions\", argmaxUDF(\"output\")) \\\n", + " .select(\"predictions\", \"labels\")\n", + "\n", + "numRows = imagePredictions.count()\n", + "\n", + "end = time.time()\n", + "print(\"classifying {} images took {} seconds\".format(numRows,end-start))\n", + "\n", + "# Register the predictions as a temp table for further analysis using SQL\n", + "imagePredictions.registerTempTable(\"ImagePredictions\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot confusion matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%sql -q -o imagePredictions\n", + "select * from ImagePredictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%local\n", + "y, y_hat = imagePredictions[\"labels\"], imagePredictions[\"predictions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "imagePredictions = imagePredictions.toPandas()\n", + "y, y_hat = imagePredictions[\"labels\"], imagePredictions[\"predictions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "cm = confusion_matrix(y, y_hat)\n", + "\n", + "labels = [\"airplane\", \"automobile\", \"bird\", \"cat\", \"deer\", \"dog\", \"frog\",\n", + " \"horse\", \"ship\", \"truck\"]\n", + "plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n", + "plt.colorbar()\n", + "tick_marks = np.arange(len(labels))\n", + "plt.xticks(tick_marks, labels, rotation=90)\n", + "plt.yticks(tick_marks, labels)\n", + "plt.xlabel(\"Predicted label\")\n", + "plt.ylabel(\"True Label\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/302 - Pipeline Image Transformations.ipynb b/notebooks/samples/302 - Pipeline Image Transformations.ipynb new file mode 100644 index 0000000000..51c0d5d1dc --- /dev/null +++ b/notebooks/samples/302 - Pipeline Image Transformations.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example shows how to manipulate the collection of images.\n", + "First, the images are downloaded to the local directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "IMAGE_PATH = \"datasets/CIFAR10\"\n", + "\n", + "import os, subprocess\n", + "from urllib.request import urlretrieve\n", + "dataFile = \"test.zip\"\n", + "if not os.path.isdir(IMAGE_PATH):\n", + " os.makedirs(IMAGE_PATH)\n", + " urlretrieve(\"https://mmlspark.azureedge.net/datasets/CIFAR10/test.zip\",\n", + " IMAGE_PATH + \".zip\")\n", + " print(subprocess.check_output(\n", + " \"ip=\\\"%s\\\"; cd \\\"$ip\\\" && unzip -q \\\"../$(basename $PWD).zip\\\"\" % IMAGE_PATH,\n", + " stderr = subprocess.STDOUT, shell = True)\n", + " .decode(\"utf-8\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%local\n", + "IMAGE_PATH = \"/datasets/CIFAR10/test\"\n", + "import subprocess\n", + "if subprocess.call([\"hdfs\", \"dfs\", \"-test\", \"-d\", IMAGE_PATH]):\n", + " from urllib import urlretrieve\n", + " urlretrieve(\"https://mmlspark.azureedge.net/datasets/CIFAR10/test.zip\", \"/tmp/test.zip\")\n", + " print subprocess.check_output(\"rm -rf /tmp/CIFAR10 && mkdir -p /tmp/CIFAR10 && unzip /tmp/test.zip -d /tmp/CIFAR10\", stderr=subprocess.STDOUT, shell=True)\n", + " print subprocess.check_output(\"hdfs dfs -mkdir -p %s\" % IMAGE_PATH, stderr=subprocess.STDOUT, shell=True)\n", + " print subprocess.check_output(\"hdfs dfs -copyFromLocal -f /tmp/CIFAR10/test/011*.png %s\"%IMAGE_PATH, stderr=subprocess.STDOUT, shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "IMAGE_PATH = \"/datasets/CIFAR10/test\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The images are loaded from the directory (for fast prototyping, consider loading a fraction of", + " images). Inside the dataframe, each image is a single field in the image column. The image has", + " sub-fields (path, height, width, OpenCV type and OpenCV bytes)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import mmlspark\n", + "import numpy as np\n", + "from mmlspark import toNDArray\n", + "\n", + "images = spark.readImages(IMAGE_PATH, recursive = True, sampleRatio = 0.1).cache()\n", + "images.printSchema()\n", + "print(images.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When collected from the *DataFrame*, the image data are stored in a *Row*, which is Spark's way", + " to represent structures (in the current example, each dataframe row has a single Image, which", + " itself is a Row). It is possible to address image fields by name and use `toNDArray()` helper", + " function to convert the image into numpy array for further manipulations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "\n", + "data = images.take(3) # take first three rows of the dataframe\n", + "im = data[2][0] # the image is in the first column of a given row\n", + "\n", + "print(\"image type: {}, number of fields: {}\".format(type(im), len(im)))\n", + "print(\"image path: {}\".format(im.path))\n", + "print(\"height: {}, width: {}, OpenCV type: {}\".format(im.height, im.width, im.type))\n", + "\n", + "arr = toNDArray(im) # convert to numpy array\n", + "Image.fromarray(arr, \"RGB\") # display the image inside notebook\n", + "print(images.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use ImageTransform for the basic image manipulation: resizing, cropping, etc.\n", + "Internally, operations are pipelined and backed by OpenCV implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import ImageTransform\n", + "\n", + "tr = (ImageTransform() # images are resized and then cropped\n", + " .setOutputCol(\"transformed\")\n", + " .resize(height = 200, width = 200)\n", + " .crop(0, 0, height = 180, width = 180) )\n", + "\n", + "small = tr.transform(images).select(\"transformed\")\n", + "\n", + "im = small.take(3)[2][0] # take third image\n", + "Image.fromarray(toNDArray(im), \"RGB\") # display the image inside notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the advanced image manipulations, use Spark UDFs.\n", + "The MMLSpark package provides conversion function between *Spark Row* and", + " *ndarray* image representations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from mmlspark import ImageSchema, toNDArray, toImage\n", + "\n", + "def u(row):\n", + " array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n", + " array[:,:,2] = 0\n", + " return toImage(array) # numpy array back to Spark Row structure\n", + "\n", + "noBlueUDF = udf(u,ImageSchema)\n", + "\n", + "noblue = small.withColumn(\"noblue\", noBlueUDF(small[\"transformed\"])).select(\"noblue\")\n", + "\n", + "im = noblue.take(3)[2][0] # take second image\n", + "Image.fromarray(toNDArray(im), \"RGB\") # display the image inside notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Images could be unrolled into the dense 1D vectors suitable for CNKT evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import UnrollImage\n", + "\n", + "unroller = UnrollImage().setInputCol(\"noblue\").setOutputCol(\"unrolled\")\n", + "\n", + "unrolled = unroller.transform(noblue).select(\"unrolled\")\n", + "\n", + "vector = unrolled.take(1)[0][0]\n", + "print(type(vector))\n", + "len(vector.toArray())" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/tests/BasicDFOpsSmokeTest.ipynb b/notebooks/tests/BasicDFOpsSmokeTest.ipynb new file mode 100644 index 0000000000..32222c3ecd --- /dev/null +++ b/notebooks/tests/BasicDFOpsSmokeTest.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert(\"spark\" in globals())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert(sc.defaultParallelism > 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "from pyspark.sql.types import StringType, FloatType, StructField, StructType\n", + "\n", + "d = load_iris()\n", + "\n", + "def make_records(features, label, label_names):\n", + " temp = [float(f) for f in features]\n", + " temp.append(str(label_names[label]))\n", + " return temp\n", + "\n", + "col_types = [StructField(fname, FloatType(), False) for fname in d[\"feature_names\"]]\n", + "col_types.append(StructField(\"target\", StringType(), False))\n", + "schema = StructType(col_types)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = spark.createDataFrame([make_records(feature, label, d[\"target_names\"]) \\\n", + " for feature, label in zip(d[\"data\"], d[\"target\"])], schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "assert(df.count() == 150)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "expected_columns = d[\"feature_names\"].copy()\n", + "expected_columns.append(\"target\")\n", + "assert(all(actual == expected for actual, expected in zip(df.columns, expected_columns)))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/runme b/runme new file mode 100755 index 0000000000..8d1cf0c77e --- /dev/null +++ b/runme @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +######################################################################## +# This script serves as both an environment installation script and as a +# build script, for use either on a build machine or a on a developer +# machine. +# +# Arguments to the script are all parsed as variable settings in the +# form of "VAR=VAL", which means that you can use either "X=Y ./runme" +# or "./runme X=Y". +# +# The script works in a "mode" determined by $BUILDMODE. The current +# modes are: +# +# * "build": performs all steps listed below, +# * "server": similar, but intended for use on a build agent, +# * "setup": perform only the setup/update steps, skipping the build, +# * "runtime": similar to "setup", but installs only runtime libraries. +# +# The default (when $BUILDMODE is not set) is similar to doing the +# environment setup part -- but if that does nothing then it continues +# with the build. You can therefore run it once to work directly in the +# IDE, running it again only on updates, or use it to do full builds +# (but you can use sbt/etc for that too). +# +# Here are the steps that are performed: +# +# 1. Setup a working environment with a bunch of needed packages +# installed. (All are installed in $HOME/lib with symlinks in +# $HOME/bin, no system changes.) +# +# 2. Possibly update existing packages if the configuration (in +# "tools/config.sh") was updated. +# +# 3. Ensure that the environment is properly set up for these tools. +# For example, set $PATH to include $HOME/bin, verify a configured +# git identity. +# +# 4. Runs a build, including tests, packaging the result, etc. +# +# Look for `defvar` in "config.sh" to see other variables that customize +# the build (e.g., $TESTS). +######################################################################## + +. 
"$(dirname "$(realpath "${BASH_SOURCE[0]}")")/tools/runme/runme.sh" + +# run an actual build only when this is invoked directly +if ((${#BASH_SOURCE[@]} == 1)); then _runme; fi diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000000..cbb6cff395 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,44 @@ +# Compilation/build/tests +target/ +# Generated python code +/src/main/resources/mmlspark + +# SBT meta-level generated files (build.properties is created by runme) +/autogen.sbt +/project/build.properties +/project/autogen.scala +/project/dependencies.* +/project/project-roots.txt +/scalastyle-config.xml + +# More SBT things +lib_managed/ +src_managed/ +.cache* +.history +.lib +/project/boot/ +/project/plugins/project/ +/project/project/ +/scalastyle-config.xml + +# IntelliJ w/ Scala +.idea +.scala_dependencies +.worksheet +*.sc +*.iml +*.ipr +*.iws +out + +# Additional Eclipse things +.classpath +.project +.settings +.target + +# ENSIME +.ensime +.ensime_lucene +.ensime_cache diff --git a/src/.sbtopts b/src/.sbtopts new file mode 100644 index 0000000000..870a32019f --- /dev/null +++ b/src/.sbtopts @@ -0,0 +1,2 @@ +-J-Xmx4G +-J-XX:ReservedCodeCacheSize=256M diff --git a/src/build.sbt b/src/build.sbt new file mode 100644 index 0000000000..63c5f776fc --- /dev/null +++ b/src/build.sbt @@ -0,0 +1,11 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +name := "mmlspark" + +Extras.rootSettings + +enablePlugins(ScalaUnidocPlugin) + +// Use `in ThisBuild` to provide defaults for all sub-projects +version in ThisBuild := Extras.mmlVer diff --git a/src/checkpoint-data/build.sbt b/src/checkpoint-data/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/checkpoint-data/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/checkpoint-data/src/main/scala/CheckpointData.scala b/src/checkpoint-data/src/main/scala/CheckpointData.scala new file mode 100644 index 0000000000..eaf0d91466 --- /dev/null +++ b/src/checkpoint-data/src/main/scala/CheckpointData.scala @@ -0,0 +1,71 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.param._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ +import org.apache.spark.storage._ + +trait CheckpointDataParams extends MMLParams { + + // Determines the storage level: MEMORY_ONLY or MEMORY_AND_DISK + val diskIncluded: BooleanParam = BooleanParam(this, "diskIncluded", "Persist to disk as well as memory", false) + final def getDiskIncluded: Boolean = $(diskIncluded) + def setDiskIncluded(value: Boolean): this.type = set(diskIncluded, value) + + // Enables reverse operation to free up memory + val removeCheckpoint: BooleanParam = BooleanParam(this, "removeCheckpoint", "Unpersist a cached dataset", false) + final def getRemoveCheckpoint: Boolean = $(removeCheckpoint) + def setRemoveCheckpoint(value: Boolean): this.type = set(removeCheckpoint, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + schema + } + +} + +class CheckpointData(override val uid: String) extends Transformer with CheckpointDataParams { + + def this() = this(Identifiable.randomUID("CheckpointData")) + + override def transform(dataset: Dataset[_]): DataFrame = { + if ($(removeCheckpoint)) { + CheckpointData.clearCache(dataset, false) + } else { + CheckpointData.cache(dataset, $(diskIncluded), false) + } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): CheckpointData = defaultCopy(extra) + +} + +object CheckpointData extends DefaultParamsReadable[CheckpointData]{ + + def clearCache(ds: Dataset[_], blocking: Boolean): DataFrame = { + ds.unpersist(blocking) + ds.toDF + } + + def cache(ds: Dataset[_], disk: Boolean, serialized: Boolean): DataFrame = { + ds.persist(if (disk && serialized) StorageLevel.MEMORY_AND_DISK_SER + else if (serialized) StorageLevel.MEMORY_ONLY_SER + else if (disk) StorageLevel.MEMORY_AND_DISK + else StorageLevel.MEMORY_ONLY) + ds.toDF + } + + def persistToHive(ds: Dataset[_], dbName: String, tableName: String): DataFrame = { + ds.write.mode("overwrite").saveAsTable(dbName + "." + tableName) + ds.toDF + } + +} diff --git a/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala b/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala new file mode 100644 index 0000000000..5fd146cc35 --- /dev/null +++ b/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala @@ -0,0 +1,41 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ + +class CheckpointDataSuite extends TestBase { + + test("Smoke test for Spark session version") { + assert(session.sparkContext.version == + sys.env.getOrElse("SPARK_VERSION", + sys.error("Missing $SPARK_VER environment variable"))) + } + + import session.implicits._ + + test("Cache DF") { + val input = makeBasicDF() + input.createOrReplaceTempView("cachingDFView") + + val checkpointer = new CheckpointData().setDiskIncluded(false).setRemoveCheckpoint(false) + checkpointer.transform(input) + + assert(input.sqlContext.isCached("cachingDFView")) + } + + test("Remove Cache on DF") { + assert(session.sqlContext.isCached("cachingDFView")) + val input = session.table("cachingDFView") + + val checkpointer = new CheckpointData().setDiskIncluded(false).setRemoveCheckpoint(true) + checkpointer.transform(input) + + assert(!input.sqlContext.isCached("cachingDFView")) + } + +} diff --git a/src/cntk-model/build.sbt b/src/cntk-model/build.sbt new file mode 100644 index 0000000000..83b404851c --- /dev/null +++ b/src/cntk-model/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: readers +//> DependsOn: image-transformer diff --git a/src/cntk-model/src/main/python/CNTKModel.py b/src/cntk-model/src/main/python/CNTKModel.py new file mode 100644 index 0000000000..695943e38d --- /dev/null +++ b/src/cntk-model/src/main/python/CNTKModel.py @@ -0,0 +1,21 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from mmlspark._CNTKModel import _CNTKModel +from pyspark.ml.common import inherit_doc + +@inherit_doc +class CNTKModel(_CNTKModel): + """ + :param SparkSession SparkSession: The SparkSession that will be used to find the model + :param str location: The location of the model, either on local or HDFS + """ + def setModel(self, sparkSession, location): + jSpark = sparkSession._jsparkSession + self._java_obj = self._java_obj.setModel(jSpark, location) + return self diff --git a/src/cntk-model/src/main/scala/CNTKModel.scala b/src/cntk-model/src/main/scala/CNTKModel.scala new file mode 100644 index 0000000000..d84c552879 --- /dev/null +++ b/src/cntk-model/src/main/scala/CNTKModel.scala @@ -0,0 +1,230 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.File +import javax.xml.bind.DatatypeConverter._ + +import com.microsoft.CNTK.{Function => CNTKFunction, DataType => CNTKDataType, _} +import com.microsoft.ml.spark.schema.DatasetExtensions +import org.apache.commons.io.FileUtils._ +import org.apache.spark.broadcast._ +import org.apache.spark.SparkContext +import org.apache.spark.SparkFiles +import org.apache.spark.ml.Model +import org.apache.spark.ml.linalg.{DenseVector, Vectors} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} +import org.apache.spark.ml.util._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer + +private object CNTKModelUtils extends java.io.Serializable { + + private def applyModel(inputIndex: Int, + broadcastModelBytes: Broadcast[Array[Byte]], + minibatchSize: Int, + inputNode: Int, + outputNode: Option[String])(inputRows: Iterator[Row]): Iterator[Row] = { + val device = DeviceDescriptor.useDefaultDevice + val m = CNTKModel.loadModelFromBytes(broadcastModelBytes.value, device) + val model = outputNode + .map { name => CNTKLib.AsComposite(Option(m.findByName(name)).getOrElse( + throw new IllegalArgumentException(s"Node $name does not exist"))) } + .getOrElse(m) + + val inputVar = model.getArguments.get(inputNode) + require(inputVar.getDataType() == CNTKDataType.Float, "input variable type is not Float input type") + val inputShape = inputVar.getShape + + // This defines and instantiates an iterator, hasNext and next are the abstract methods that + // define the interface and inputBuffer and outputBuffer hold the input and output rows so that + // they can be joined and returned. 
+ // The logic inside next checks to see if the buffer is empty, and if so sends a new batch off + // to be evaluated + new Iterator[Row] { + val inputBuffer = new ListBuffer[Row]() + val outputBuffer = new ListBuffer[Row]() + val inputSize: Int = inputShape.getTotalSize().toInt + val inputFVV = new FloatVectorVector(minibatchSize.toLong) + val fvs: Array[FloatVector] = + (0 until minibatchSize).map(_ => new FloatVector(inputSize.toLong)).toArray + + def hasNext: Boolean = inputRows.hasNext || outputBuffer.nonEmpty + + def next(): Row = { + if (outputBuffer.isEmpty) { + var paddedRows = 0 + for (i <- 0 until minibatchSize) { + if (inputRows.hasNext) { + val row = inputRows.next() + inputBuffer += row + for ((x, j) <- row.getSeq[Float](inputIndex).view.zipWithIndex) { + fvs(i).set(j, x) + } + } else { + //TODO remove padding after CNTK bug is fixed + paddedRows += 1 + for (j <- 0 until inputSize) { + fvs(i).set(j, 0.0.toFloat) + } + } + inputFVV.set(i, fvs(i)) + } + + val inputVal = + Value.createDenseFloat(inputShape, inputFVV, device) + val inputDataMap = new UnorderedMapVariableValuePtr() + inputDataMap.add(inputVar, inputVal) + + val outputDataMap = new UnorderedMapVariableValuePtr() + val outputVar = model.getOutputs.get(0) + outputDataMap.add(outputVar, null) + + model.evaluate(inputDataMap, outputDataMap, device) + + val outputFVV = new FloatVectorVector() + outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV) + assert(outputBuffer.isEmpty, + "The output row buffer should be empty before new elements are added.") + outputBuffer ++= toSeqSeq(outputFVV) + .dropRight(paddedRows) + .map(fs => Row(Vectors.dense(fs.map(_.toDouble).toArray))) + } + val ret = Row.merge(inputBuffer.head, outputBuffer.head) + inputBuffer.remove(0) + outputBuffer.remove(0) + ret + } + } + } + + // here just for serialization + val applyModelFunc = (inputIndex: Int, broadcastModelBytes: Broadcast[Array[Byte]], + minibatchSize: Int, inputNode: Int, + outputNode: Option[String]) => { + (inputRows: Iterator[Row]) => { + applyModel(inputIndex, broadcastModelBytes, minibatchSize, inputNode, outputNode)(inputRows) + } + } + + private def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { + (0 until fvv.size.toInt).map(i => (0 until fvv.get(i).size.toInt).map(j => fvv.get(i).get(j))) + } +} + +object CNTKModel extends DefaultParamsReadable[CNTKModel] { + def loadModelFromBytes(bytes: Array[Byte], + device: DeviceDescriptor = + DeviceDescriptor.useDefaultDevice): CNTKFunction = { + import java.util.UUID._ + val modelFile = new File(s"$getTempDirectoryPath/$randomUUID.model") + writeByteArrayToFile(modelFile, bytes) + val model = try { + CNTKFunction.load(modelFile.getPath, device) + } finally forceDelete(modelFile) + model + } + + override def load(path: String): CNTKModel = super.load(path) +} + +@InternalWrapper +class CNTKModel(override val uid: String) extends Model[CNTKModel] with DefaultParamsWritable + with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("CNTKModel")) + + val model: Param[String] = + new Param(this, "model", "Array of bytes containing the serialized CNTKModel") + def setModel(spark: SparkSession, path: String): CNTKModel = { + val modelBytes = spark.sparkContext.binaryFiles(path).first()._2.toArray + set(model, printBase64Binary(modelBytes)) + } + def getModel: Array[Byte] = parseBase64Binary($(model)) + + val inputNode: IntParam = new IntParam(this, "inputNode", "index of the input node") + def setInputNode(value: Int): this.type = 
set(inputNode, value) + def getInputNode: Int = $(inputNode) + setDefault(inputNode -> 0) + + val outputNodeIndex: IntParam = new IntParam(this, "outputNodeIndex", "index of the output node") + def setOutputNodeIndex(value: Int): this.type = set(outputNodeIndex, value) + def getOutputNodeIndex: Int = $(outputNodeIndex) + + val outputNodeName: Param[String] = new Param(this, "outputNodeName", "name of the output node") + def setOutputNodeName(value: String): this.type = set(outputNodeName, value) + def getOutputNodeName: String = $(outputNodeName) + + val miniBatchSize: IntParam = + new IntParam(this, "miniBatchSize", "size of minibatches", ParamValidators.gt(0)) + def setMiniBatchSize(value: Int): this.type = set(miniBatchSize, value) + def getMiniBatchSize: Int = $(miniBatchSize) + setDefault(miniBatchSize -> 10) + + def transformSchema(schema: StructType): StructType = schema.add(getOutputCol, VectorType) + + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + def transform(dataset: Dataset[_]): DataFrame = { + val spark = dataset.sparkSession + val sc = spark.sparkContext + val inputIndex = dataset.columns.indexOf(getInputCol) + val device = DeviceDescriptor.useDefaultDevice + + if (inputIndex == -1) + throw new IllegalArgumentException(s"Input column $getInputCol does not exist") + + val model = CNTKModel.loadModelFromBytes(getModel, device) + + val setByName = get(outputNodeName) + val setByIndex = get(outputNodeIndex) + if ((setByName.isDefined && setByIndex.isDefined) || + (!setByName.isDefined && !setByIndex.isDefined)) + throw new Exception("Must specify one and only one of outputNodeName or outputNodeIndex") + + val outputNode: Option[String] = + if (setByName.isDefined) setByName + else setByIndex.map(i => model.getOutputs.get(i).getName) + + val coersionOptionUDF = dataset.schema.fields(inputIndex).dataType match { + case ArrayType(tp, _) => + tp match { + case DoubleType => Some(udf((x: mutable.WrappedArray[Double]) => x.map(_.toFloat))) + case FloatType => None + case _ => + throw new IllegalArgumentException(s"improper column type: $tp, need Array[Float]") + } + case VectorType => Some(udf((x: DenseVector) => x.toArray.map(_.toFloat))) + } + + val coercedCol = DatasetExtensions.findUnusedColumnName("coerced")(dataset.columns.toSet) + val (df, selectedIndex) = coersionOptionUDF match { + case Some(coersionUDF) => + val coercedDF = dataset.toDF().withColumn(coercedCol, coersionUDF(col(getInputCol))) + (coercedDF, coercedDF.columns.indexOf(coercedCol)) + case None => (dataset.toDF(), inputIndex) + } + + val inputType = df.schema($(inputCol)).dataType + val broadcastModelBytes = sc.broadcast(getModel) + val rdd = df.rdd.mapPartitions( + CNTKModelUtils.applyModelFunc(selectedIndex, + broadcastModelBytes, + getMiniBatchSize, + getInputNode, + outputNode)) + val output = spark.createDataFrame(rdd, df.schema.add(StructField(getOutputCol, VectorType))) + + coersionOptionUDF match { + case Some(_) => output.drop(coercedCol) + case None => output + } + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKBindingSuite.scala b/src/cntk-model/src/test/scala/CNTKBindingSuite.scala new file mode 100644 index 0000000000..1bf842e7cf --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKBindingSuite.scala @@ -0,0 +1,60 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
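+
+// This suite drives the low-level CNTK Java bindings directly, mirroring the batching
+// done in CNTKModelUtils.applyModel. A rough sketch of the round trip (assuming a
+// loaded `model`, its first argument `inputVar` with shape `inputShape`, and a
+// minibatch `batch: Seq[Seq[Float]]`):
+//
+//   val inputVal      = Value.createDenseFloat(inputShape, toFVV(batch), DeviceDescriptor.getCPUDevice)
+//   val inputDataMap  = new UnorderedMapVariableValuePtr()
+//   inputDataMap.add(inputVar, inputVal)
+//   val outputDataMap = new UnorderedMapVariableValuePtr()
+//   val outputVar     = model.getOutputs.get(0)
+//   outputDataMap.add(outputVar, null)
+//   model.evaluate(inputDataMap, outputDataMap, DeviceDescriptor.getCPUDevice)
+//   val outputFVV = new FloatVectorVector()
+//   outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV)
+//   val scores: Seq[Seq[Float]] = toSeqSeq(outputFVV)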
+ +package com.microsoft.ml.spark + +import com.microsoft.CNTK.{Function => CNTKFunction, _} + +class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils { + + def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { + (0 until fvv.size.toInt).map(i => + (0 until fvv.get(i).size().toInt).map(j => fvv.get(i).get(j))) + } + + def toFVV(minibatch: Seq[Seq[Float]]): FloatVectorVector = { + minibatch.foldLeft(new FloatVectorVector()) { + case (fvv, floats) => + fvv.add(floats.foldLeft(new FloatVector()) { case (fv, f) => fv.add(f); fv }) + fvv + } + } + + def randomSeqSeq(outerSize: Int, innerSize: Int = 32 * 32 * 3): Seq[Seq[Float]] = { + val r = scala.util.Random + (1 to outerSize).map(i => { + (1 to innerSize).map(j => { + r.nextFloat() + }) + }) + } + + ignore("Evaluate should be able to change batch size ") { + val model = CNTKFunction.load(modelPath, DeviceDescriptor.useDefaultDevice) + val inputVar = model.getArguments.get(0) + val inputShape = inputVar.getShape + + def evaluateRandomMinibatch(batchSize: Int): Seq[Seq[Float]] = { + val fakeImages = randomSeqSeq(batchSize) + val inputFVV = toFVV(fakeImages) + val inputVal = Value.createDenseFloat(inputShape, inputFVV, DeviceDescriptor.getCPUDevice) + val inputDataMap = new UnorderedMapVariableValuePtr() + inputDataMap.add(inputVar, inputVal) + + val outputDataMap = new UnorderedMapVariableValuePtr() + val outputVar = model.getOutputs.get(0) + outputDataMap.add(outputVar, null) + + println(s"evaluating shape ${inputVal.getShape().getDimensions}") + model.evaluate(inputDataMap, outputDataMap, DeviceDescriptor.getCPUDevice) + val outputFVV = new FloatVectorVector() + outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV) + toSeqSeq(outputFVV) + } + evaluateRandomMinibatch(1) + evaluateRandomMinibatch(3) + evaluateRandomMinibatch(2) + + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKModelSuite.scala b/src/cntk-model/src/test/scala/CNTKModelSuite.scala new file mode 100644 index 0000000000..13e021530e --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKModelSuite.scala @@ -0,0 +1,157 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
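+
+// These tests read their inputs relative to the DATASETS_HOME environment variable
+// (CIFAR images and the ConvNet_CIFAR10.model file, see CNTKTestUtils). A minimal
+// usage sketch of the transformer under test, assuming a SparkSession `spark`, a
+// serialized model at `modelPath`, and a DataFrame `df` whose input column holds
+// Array[Float] (Array[Double] and Vector columns are coerced to floats by transform):
+//
+//   val scored = new CNTKModel()
+//     .setModel(spark, modelPath)     // model bytes are base64-encoded into a string Param
+//     .setInputCol("cntk_images")
+//     .setOutputCol("out")
+//     .setMiniBatchSize(10)           // the default minibatch size
+//     .setOutputNodeIndex(3)          // or setOutputNodeName("z"), but not both
+//     .transform(df)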
+ +package com.microsoft.ml.spark + +import java.io.File +import java.util.Date + +import org.apache.commons.io.FileUtils.getTempDirectoryPath +import org.apache.spark.SparkException +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types._ + +class CNTKModelSuite extends LinuxOnly with CNTKTestUtils { + + // TODO: Move away from getTempDirectoryPath and have TestBase provide one + val saveFile = s"$getTempDirectoryPath/${new Date()}-spark-z.model" + + def testModel(minibatchSize: Int = 10): CNTKModel = { + new CNTKModel() + .setModel(session, modelPath) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setMiniBatchSize(minibatchSize) + .setOutputNodeIndex(3) + } + + val images = testImages(session) + + private def checkParameters(minibatchSize: Int) = { + val model = testModel(minibatchSize) + val result = model.transform(images) + compareToTestModel(result) + } + + test("A CNTK model should be able to support setting the input and output node") { + val model = testModel().setInputNode(0) + + val data = makeFakeData(session, 3, featureVectorLength) + val result = model.transform(data) + assert(result.select(outputCol).count() == 3) + } + + test("A CNTK model should support finding a node by name") { + val model = new CNTKModel() + .setModel(session, modelPath) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setOutputNodeName("z") + + val data = makeFakeData(session, 3, featureVectorLength) + val result = model.transform(data) + assert(result.select(outputCol).collect()(0).getAs[DenseVector](0).size == 10) + assert(result.select(outputCol).count() == 3) + } + + test("throws useful exception when invalid node name is given") { + val model = new CNTKModel() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setOutputNodeName("nonexistant-node") + .setModel(session, modelPath) + + val data = makeFakeData(session, 3, featureVectorLength) + val se = intercept[SparkException] { model.transform(data).collect() } + assert(se.getCause.isInstanceOf[IllegalArgumentException]) + } + + test("A CNTK model should work on doubles") { + val model = testModel() + val data = makeFakeData(session, 3, featureVectorLength, outputDouble = true) + val result = model.transform(data) + assert(result.select(outputCol).collect()(0).getAs[DenseVector](0).size == 10) + assert(result.count() == 3) + } + + test("A CNTK model should output Vectors and interop with other estimators") { + val model = testModel() + val data = makeFakeData(session, 3, featureVectorLength, outputDouble = true) + val result = model.transform(data) + assert(result.select(outputCol).schema.fields(0).dataType == VectorType) + + val predictions = new LogisticRegression() + .setFeaturesCol(outputCol) + .setLabelCol(labelCol) + .fit(result) + .transform(result) + assert(predictions.select("prediction").collect().length == 3) + } + + test("A CNTK model should have a default minibatch size") { + val model = testModel() + val result = model.transform(images) + compareToTestModel(result) + } + + test("A CNTK model should work on resized batches") { + val model = testModel() + val result = model.transform(images.repartition(1)) + compareToTestModel(result) + //images.printSchema() + //result.show() + } + + test("A CNTK model should work on an empty dataframe") { + val images = 
session.createDataFrame(sc.emptyRDD[Row], + StructType( + StructField(inputCol, ArrayType(FloatType, false)) :: + Nil)) + val model = testModel() + val result = model.transform(images) + assert(result.count == 0) + } + + test("A CNTK Model should process images") { + checkParameters(1) + checkParameters(10) + checkParameters(100) + } + + test("A CNTK Model should be saveable") { + val model = testModel() + model.write.overwrite().save(saveFile) + val modelLoaded = CNTKModel.load(saveFile) + val result = modelLoaded.transform(images) + compareToTestModel(result) + } + + test("A CNTK Model should be pipeline compatible") { + val model = testModel() + val pipe = new Pipeline().setStages(Array(model)).fit(images) + pipe.write.overwrite().save(saveFile) + val pipeLoaded = PipelineModel.load(saveFile) + val result = pipeLoaded.transform(images) + compareToTestModel(result) + } + + test("useful error message if invalid column name is given") { + val model = testModel().setInputCol("images") + val pipe = new Pipeline().setStages(Array(model)).fit(images) + pipe.write.overwrite().save(saveFile) + val pipeLoaded = PipelineModel.load(saveFile) + assertThrows[IllegalArgumentException] { + pipeLoaded.transform(images) + } + } + + override def afterAll(): Unit = { + new File(saveFile).delete() + super.afterAll() + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKTestUtils.scala b/src/cntk-model/src/test/scala/CNTKTestUtils.scala new file mode 100644 index 0000000000..30f134a8f5 --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKTestUtils.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import org.apache.spark.ml.linalg.DenseVector +import com.microsoft.ml.spark.Readers.implicits._ + +trait CNTKTestUtils { + + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + val modelPath = s"$filesRoot/CNTKModel/ConvNet_CIFAR10.model" + + val inputCol = "cntk_images" + val outputCol = "out" + val labelCol = "labels" + + val featureVectorLength = 3 * 32 * 32 + + def testModelDF(spark: SparkSession): DataFrame = { + import spark.implicits._ + spark.sparkContext.parallelize(Seq( + Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, + -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), + Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, + -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), + Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, + 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), + Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, + -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), + Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, + 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), + Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470, + 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF + } + + def testImages(spark: SparkSession): DataFrame = { + val images = spark.readImages(imagePath, true) + + val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) + + unroll.transform(images).select(inputCol) + } + + def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { + import spark.implicits._ + if (outputDouble) { + List + 
.fill(rows)(List.fill(size)(0.0).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } else { + List + .fill(rows)(List.fill(size)(0.0.toFloat).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } + } + + protected def compareToTestModel(result: DataFrame) = { + //TODO improve checks + assert(result.columns.toSet == Set(inputCol, outputCol)) + assert(result.count() == testModelDF(result.sparkSession).count()) + val max = result + .select(outputCol) + .collect() + .map(row => row.getAs[DenseVector](0).toArray.max) + .max + assert(max < 10 & max > -10) + } + +} diff --git a/src/cntk-train/build.sbt b/src/cntk-train/build.sbt new file mode 100644 index 0000000000..f418ec0a86 --- /dev/null +++ b/src/cntk-train/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: featurize +//> DependsOn: cntk-model diff --git a/src/cntk-train/src/main/python/CNTKLearner.py b/src/cntk-train/src/main/python/CNTKLearner.py new file mode 100644 index 0000000000..0e985b33ae --- /dev/null +++ b/src/cntk-train/src/main/python/CNTKLearner.py @@ -0,0 +1,23 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from mmlspark._CNTKLearner import _CNTKLearner +from mmlspark.CNTKModel import CNTKModel as CNTKmod +from pyspark.ml.common import inherit_doc + +@inherit_doc +class CNTKLearner(_CNTKLearner): + """ + Create CNTK model from existing java model + :param py4j.java_gateway.JavaObject java_model: see Scala CNTKModel documentation + """ + def _create_model(self, java_model): + model = CNTKmod() + model._java_obj = java_model + model._transfer_params_from_java() + return model diff --git a/src/cntk-train/src/main/scala/BrainscriptBuilder.scala b/src/cntk-train/src/main/scala/BrainscriptBuilder.scala new file mode 100644 index 0000000000..cbe6172eba --- /dev/null +++ b/src/cntk-train/src/main/scala/BrainscriptBuilder.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.{FileOutputStream, ObjectOutputStream} +import java.util.UUID + +import scala.collection.mutable.ListBuffer +import scala.sys.process._ + +import com.microsoft.ml.spark.schema._ +import FileUtilities._ + +import org.apache.hadoop.fs.Path + +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} +import org.apache.spark.ml._ + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +// Don't get too excited AK. This is starting to look like a set of contracts.. +case class InputShape(dim: Int, form: String) +case class InputData(format: String, path: String, shapes: Map[String, InputShape]) +case class BrainScriptConfig(name: String, text: Seq[String]) + +// It would be nice to extend from Params for this, but this +// seems more useful than just Spark, so not doing it for now +class BrainScriptBuilder { + + // We need to know a few things: + // 1. Where is the input data? + // 2. How do we configure the training itself? + // 3. Where should we put the outputs? 
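+  //
+  // A rough usage sketch (mirroring the BrainScript configuration test), assuming an
+  // input file "in.txt" in CNTK text format with a 10000-dimensional sparse "features"
+  // input and a 1-dimensional dense "labels" input:
+  //
+  //   val config = new BrainScriptBuilder()
+  //     .setOutputRoot("out")
+  //     .setInputFile("in.txt", "text",
+  //       Map("features" -> InputShape(10000, "sparse"),
+  //           "labels" -> InputShape(1, "dense")))
+  //   val overrides = config.toOverrideConfig()  // command=..., precision=..., reader=[...], etc.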
+
+  var modelName = "ModelOut"
+
+  var inData: Option[InputData] = None
+
+  var rootDir: String = ""
+  var outDir: String = ""
+  var weightPrecision: String = "float"
+
+  var commands = ListBuffer[String]("trainNetwork")
+  var testModel = false
+
+  def setInputFile(path: String, format: String, shapes: Map[String, InputShape]): this.type = {
+    inData = Some(InputData(format, path, shapes))
+    this
+  }
+
+  def setModelName(n: String): this.type = {
+    modelName = n
+    this
+  }
+
+  def getModelPath(): String = {
+    s"""$outDir/Models/$modelName"""
+  }
+
+  def setRootDir(p: String): this.type = {
+    rootDir = p
+    this
+  }
+
+  def setOutputRoot(p: String): this.type = {
+    outDir = p
+    this
+  }
+
+  private def getInputString(): String = {
+    val ips = inData.get.shapes
+      .map { case(name, shape) => name + " = [ dim = " +
+        shape.dim.toString + " ; format = \"" + shape.form + "\" ]" }
+      .mkString("; ")
+    s"input = [ $ips ]"
+  }
+
+  def setCommands(c: String*): this.type = {
+    this
+  }
+
+  def setTestModel(b: Boolean): this.type = {
+    if (!testModel && b) {
+      commands.append("testNetwork")
+    }
+    this
+  }
+
+  def toReaderConfig(): String = {
+    val ipstring = getInputString()
+    val loc = inData.get.path
+    val form = inData.get.format match {
+      case "text" => "CNTKTextFormatReader"
+    }
+    s"""reader = [ readerType = $form ; file = "$loc" ; $ipstring ]"""
+  }
+
+  def toOverrideConfig(): Seq[String] = {
+    val rootOverrides = Seq(
+      s"""command = ${ commands.mkString(":") }""",
+      s"precision=$weightPrecision",
+      "traceLevel=1",
+      "deviceId=\"auto\"",
+      s"""rootDir="$rootDir" """,
+      s"""outputDir="$outDir" """,
+      s"""modelPath="${getModelPath}" """)
+    val commandReaders = commands.map(c => s"$c = [ ${toReaderConfig()} ]")
+
+    rootOverrides ++ commandReaders
+  }
+
+}
diff --git a/src/cntk-train/src/main/scala/CNTKLearner.scala b/src/cntk-train/src/main/scala/CNTKLearner.scala
new file mode 100644
index 0000000000..4d601d4f3d
--- /dev/null
+++ b/src/cntk-train/src/main/scala/CNTKLearner.scala
@@ -0,0 +1,168 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
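+
+// A minimal sketch of how this estimator is typically driven (see ValidateCntkTrain),
+// assuming a labeled DataFrame `trainData` with a "labels" column, a BrainScript
+// configuration string `brainScript`, and a scratch directory `workingDir`:
+//
+//   val model: CNTKModel = new CNTKLearner()
+//     .setBrainScriptText(brainScript)
+//     .setParallelTrain(false)        // set true to train over an MPI ring
+//     .setWorkingDirectory(workingDir)
+//     .fit(trainData)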
+ +package com.microsoft.ml.spark + +import java.net.URI + +import com.microsoft.ml.spark.FileUtilities._ +import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +trait CNTKParams extends MMLParams { + + // This is only needed until Train* accepts CNTKLearner instead of CL acting like Train* + val labelsColumnName = StringParam(this, "labelsColumnName", "Label col", "labels") + val featuresColumnName = StringParam(this, "featuresColumnName", "feats col", "features") + + // This will go away after the CNTK HDFS Deserializer + val localHdfsMount = StringParam(this, "localHdfsMount", "local mount point for hdfs:///") + val dataTransfer = StringParam(this, "dataTransfer", "transfer strategy", "local") + def setTransferStrategy(s: String): this.type = set(dataTransfer, s) + + // TODO: Convert to enum contract shared with CNTK's HDFS Deserializer + val dataFormat = StringParam(this, "dataFormat", "transfer format", "text") + val weightPrecision = StringParam(this, "weightPrecision", "weights", "double") + val featureCount = IntParam(this, "featureCount", "num features for reduction", 1) + def setFeatureCount(c: Int): this.type = set(featureCount, c) + + val brainScript = StringParam(this, "brainScript", "string of BS config") + def setBrainScriptText(t: String): this.type = set(brainScript, t) + def setBrainScriptFile(f: String): this.type = set(brainScript, FileUtilities.readFile(new File(f))) + + val parallelTrain = BooleanParam(this, "parallelTrain", "train using an MPI ring", true) + def setParallelTrain(b: Boolean): this.type = set(parallelTrain, b) + + val workingDir = StringParam(this, "workingDir", "working directory for CNTK", "tmp") + def setWorkingDirectory(d: String): this.type = set(workingDir, d) + +} + +object CNTKLearner extends DefaultParamsReadable[CNTKLearner] + +@InternalWrapper +class CNTKLearner(override val uid: String) extends Estimator[CNTKModel] with CNTKParams { + + def this() = this(Identifiable.randomUID("CNTKLearner")) + + override def fit(dataset: Dataset[_]): CNTKModel = { + val spark = dataset.sparkSession + val labels = $(labelsColumnName) + val features = $(featuresColumnName) + + // Convert label column to categorical on train, remove rows with missing labels + val convertedLabelDataset = dataset.na.drop(Seq(labels)) + + // This utility function is a stub for the reduction step of Featurize, and + // will probably be covered in TrainClass/Regressor + val reducedData = DataTransferUtils.reduceAndAssemble( + convertedLabelDataset, + labels, + features, + $(weightPrecision), + $(featureCount)) + + // TODO: Very bad hack - we should store vector sizes in schema for quick retrieval + // Apparently this needs some design, not natively supported. This schema transfer + // in general needs to be more robust... 
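+    // Peek at the first row to recover what the schema does not carry: the feature and
+    // label vector dimensions, and whether each is dense or sparse (CNTK needs both).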
+ val feature1 = reducedData.select(features).head.getAs[Vector](0) + val featureDim = feature1.size + val featureForm = feature1 match { + case dv: DenseVector => "dense" + case sv: SparseVector => "sparse" + } + + val label1 = reducedData.select(labels).head.getAs[Vector](0) + val labelDim = label1.size + val labelForm = label1 match { + case dv: DenseVector => "dense" + case sv: SparseVector => "sparse" + } + + val partitions = reducedData.rdd.getNumPartitions + + val cntkrootURI = new URI($(workingDir)) + val cntkrootPath = new File(cntkrootURI).getAbsolutePath + println(s"$uid working in $cntkrootPath") + val relativeInPath = s"$cntkrootURI/$uid-inputdata" + + val writer = $(dataTransfer) match { + case "local" => new LocalWriter(relativeInPath) + case "hdfs-mount" => { + val mntpt = if (isDefined(localHdfsMount)) { + val x = $(localHdfsMount) + println(s"Using override hdfsm point: $x") + x + } else { + val x = sys.env.getOrElse("HDFS_MOUNTPOINT", "tmp/mnt") + println(s"Using deduced hdfsm point: $x") + x + } + println(s"hdfs-mount mounted at $mntpt") + new HdfsMountWriter(mntpt, 1, relativeInPath, spark.sparkContext) + } + case _ => ??? + } + + // Actual data movement step + + // As discussed above, this pipelining needs to be elsewhere, so for now + // creating utility functions for reuse and not combining the steps + val conformedData = $(dataFormat) match { + case "text" => DataTransferUtils.convertDatasetToCNTKTextFormat(reducedData, labels, features) + case "parquet" => reducedData + } + + conformedData.persist() + + val remappedInPath = $(dataFormat) match { + case "text" => writer.checkpointToText(conformedData) + case "parquet" => writer.checkpointToParquet(conformedData) + } + + val relativeOutRoot = s"$cntkrootPath/$uid-outdir" + + val config = new BrainScriptBuilder() + .setOutputRoot(relativeOutRoot) + // TODO: We need a more structured form of converting schema to CNTK config + // this will come in after the parquet + CNTK-as-library work comes in + .setInputFile( + remappedInPath, + $(dataFormat), + Map(features -> InputShape(featureDim, featureForm), + labels -> InputShape(labelDim, labelForm))) + + // Train the learner + val cb = if ($(parallelTrain)) new MPICommandBuilder() else new CNTKCommandBuilder() + cb + .setWorkingDir(cntkrootPath) + .insertBaseConfig($(brainScript)) + .appendOverrideConfig(config.toOverrideConfig) + + val modelRet = ProcessUtils.runProcess(cb.buildCommand) + println(s"CNTK exited with code $modelRet") + if (modelRet != 0) { + // TODO: Use exception heirarchy + throw new Exception("CNTK Training failed. Please view output log for details") + } + + conformedData.unpersist() + + // This does not work :( + // CNTKModel.load(config.getModelPath) + // This also needs a windows dll - currently only runs on linux + new CNTKModel(uid + "-model") + .setModel(spark, config.getModelPath) + .setInputCol(features) + .setOutputCol(labels) + } + + override def copy(extra: ParamMap): Estimator[CNTKModel] = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = ??? + +} diff --git a/src/cntk-train/src/main/scala/CommandBuilders.scala b/src/cntk-train/src/main/scala/CommandBuilders.scala new file mode 100644 index 0000000000..4c02cc0e0c --- /dev/null +++ b/src/cntk-train/src/main/scala/CommandBuilders.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.{FileOutputStream, ObjectOutputStream} +import java.util.UUID +import java.net.URI + +import scala.collection.mutable.ListBuffer +import scala.sys.process._ + +import com.microsoft.ml.spark.schema._ +import FileUtilities._ + +import org.apache.hadoop.fs.Path + +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} +import org.apache.spark.ml._ + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +abstract class CNTKCommandBuilderBase { + val command: String + def arguments(): Seq[String] + val configs = ListBuffer.empty[BrainScriptConfig] + + var workingDir = new File(".").toURI + + def setWorkingDir(p: String): this.type = { + workingDir = new File(p).toURI + this + } + + def insertBaseConfig(t: String): this.type = { + configs.insert(0, BrainScriptConfig("baseConfig", Seq(t))) + this + } + + def appendOverrideConfig(t: Seq[String]): this.type = { + configs.append(BrainScriptConfig("overrideConfig", t)) + this + } + + protected def configToFile(c: BrainScriptConfig): String = { + val outFile = new File(new File(workingDir).getAbsolutePath + s"/${c.name}.cntk") + writeFile(outFile, c.text.mkString("\n")) + println(s"wrote string to ${outFile.getName}") + outFile.getAbsolutePath + } + + def buildCommand(): String +} + +class CNTKCommandBuilder(fileBased: Boolean = true) extends CNTKCommandBuilderBase { + val command = "cntk" + val arguments = Seq[String]() + + def buildCommand(): String = { + val cntkArgs = configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + command + " " + cntkArgs + } +} + +trait MPIConfiguration { + val command = "mpiexec" + // nodename -> workers per node + def nodeConfig: Map[String, Int] +} + +class MPICommandBuilder(fileBased: Boolean = true) extends CNTKCommandBuilderBase with MPIConfiguration { + + def nodeConfig: Map[String, Int] = Map("127.0.0.1" -> EnvironmentUtils.GPUCount.get) + + val argName = "-n" + val arguments = Seq(argName, nodeConfig.head._2.toString) + + def buildCommand(): String = { + val cntkArgs = "cntk " + configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + Seq(command, arguments.mkString(" "), cntkArgs, "parallelTrain=true").mkString(" ") + } +} + +class MultiNodeParallelLauncher(fileBased: Boolean = false) extends CNTKCommandBuilderBase with MPIConfiguration { + + // The difference here is the requirement of locating + // and passing on the hosts information + val nodeConfig = Map("localhost" -> 1, "remotehost" -> 1) + val arguments = if (EnvironmentUtils.IsWindows) { + Seq("--hosts", nodeConfig.size.toString) ++ nodeConfig.map { case(name, num) => s"$name $num" } + } else { + val hostFile = new File(".", "hostfile.txt") + val txt = nodeConfig.map { case(name, num) => s"$name slots=$num" }.mkString("\n") + writeFile(hostFile, txt) + Seq("-hostfile", hostFile.getCanonicalPath) + } + + def buildCommand(): String = { + val cntkArgs = configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + Seq(command, arguments.mkString(" "), cntkArgs).mkString(" ") + } + +} diff --git a/src/cntk-train/src/main/scala/DataConversion.scala b/src/cntk-train/src/main/scala/DataConversion.scala new file mode 100644 
index 0000000000..b1821a9a54 --- /dev/null +++ b/src/cntk-train/src/main/scala/DataConversion.scala @@ -0,0 +1,173 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.SparkContext + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.functions._ + +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.util.Identifiable + +import FileUtilities._ +import hadoop.HadoopUtils + +object DataTransferUtils { + + // This needs to be broken up into a few areas: + // 1. data-conversion is a library that has knowledge + // of type mappings, schema, and a canonical implementation of + // the type map conversion functions themselves + // 2. Featurize must be moved to a library utilized by the Train* APIs + // this is a 2 stage estimator: + // a. Type "reduction" utilizing a typemapper + // b. Assembly via single or "feature channel" based multi vector assembler + // at present, the current architecture limits us to + + def toText(value: Any): String = { + value match { + case v: Vector => return convertVectorToText(v) + case d: Double => return d.toString + case f: Float => return f.toString + } + } + + def toVec(value: Any): Vector = { + value match { + case v: Vector => return v + case d: Double => return new DenseVector(Array(d)) + case f: Float => return new DenseVector(Array(f.toDouble)) + case i: Integer => return new DenseVector(Array(i.toDouble)) + case l: Long => return new DenseVector(Array(l.toDouble)) + } + } + + def convertVectorToText(v: Vector): String = { + val heuristicBloat = 8 + val sb = new StringBuilder(v.numActives * heuristicBloat) + v match { + case sv: SparseVector => { + sv.foreachActive { (idx, value) => + sb.append(idx).append(":").append(value).append(" ") + () + } + } + case dv: DenseVector => { + dv.values.foreach(value => sb.append(value).append(" ")) + } + } + sb.toString + } + + private def col2vec = udf(toVec _) + private def col2str = udf(toText _) + + // This needs to be converted to a pipeline stage as stated above + def reduceAndAssemble(data: Dataset[_], + label: String, + outputVecName: String, + precision: String, + features: Int): DataFrame = { + if (precision != "double") throw new NotImplementedError("only doubles") + + val tempFeaturizer = new Featurize() + .setFeatureColumns(Map(outputVecName -> data.columns.filter(_ != label))) + .setNumberOfFeatures(features) + .setOneHotEncodeCategoricals(true) + .fit(data) + val reduced = tempFeaturizer.transform(data) + reduced.select(col2vec(reduced(label)).as(label), reduced(outputVecName)) + } + + def convertDatasetToCNTKTextFormat(data: Dataset[_], label: String, feats: String): DataFrame = { + val labelStrCol = col2str(data(label)) + val featStrCol = col2str(data(feats)) + val uberCol = concat( + lit(s"|$label "), + labelStrCol, + lit(" "), + lit(s"|$feats "), + featStrCol) + data.select(uberCol.as('value)) + } + +} + +// This is all horrid, find a better way to be cluster/local agnostic +// via DataSource and DataSink-type model. 
This will allow us to move to +// other source/sinks in the future more easily, but think is out of scope here +// TODO: this should become a set of extensions onto CheckpointData, which can +// also return a model that is JSON serializable into the DataSource representation +import java.net.URI + +abstract class DataWriter(destPath: String) { + protected val destUri = new URI(destPath) + protected val relativeDest = destUri.getPath + + protected val partitions: Int + + protected def remapPath(ext: String): String + + def constructedPath: String + + def checkpointToText(data: Dataset[_]): String = { + val fullPath = constructedPath + println(s"Writing dataset to $fullPath") + data.coalesce(partitions).write.text(fullPath) + remapPath("txt") + } + + def checkpointToParquet(data: Dataset[_]): String = { + val fullPath = constructedPath + println(s"Writing dataset to $fullPath") + data.coalesce(partitions).write.format("parquet").save(fullPath) + remapPath("parquet") + } +} + +abstract class NormalWriter(path: String) extends DataWriter(path) { + protected def remapPath(ext: String): String = constructedPath +} + +// This is used when Hadoop creates the actual single part file +// inside the path we've provided - we want that one. +abstract class SingleFileResolver(path: String) extends DataWriter(path) { + protected val remappedRoot: String + + protected def remapPath(extension: String): String = { + val dir = new File(remappedRoot) + println(s"Probing $dir for single file $constructedPath") + val file = dir.listFiles.filter(f => f.isFile && f.getName.endsWith(extension)).head + println(s"Resolving single file ${file.getAbsolutePath}") + file.getAbsolutePath + } +} + +class LocalWriter(path: String) extends SingleFileResolver(path) { + val partitions = 1 + val constructedPath = new URI("file", null, relativeDest, null, null).normalize.toString + + // TODO: Move this logic to Apache commons lang helper that already exists + // And then provide a helper function in FileUtilities. Why doesn't URI normalize properly for new File()? + val remappedRoot = { + val root = if (EnvironmentUtils.IsWindows) "C:" else "" + root + relativeDest + } +} + +class DefaultHdfsWriter(parts: Int, path: String) extends NormalWriter(path) { + val partitions = parts + val constructedPath = new URI(null, null, relativeDest, null, null).toString +} + +class HdfsMountWriter(localMnt: String, parts: Int, path: String, sc: SparkContext) extends SingleFileResolver(path) { + val partitions = parts + // TODO: Why is this required on the edge node? + val hConf = sc.hadoopConfiguration + val namenode = new HadoopUtils(hConf).getActiveNameNode + val constructedPath = new URI("hdfs", namenode, relativeDest, null, null).toString + val remappedRoot = new URI(localMnt).toString + s"/$relativeDest" +} diff --git a/src/cntk-train/src/main/scala/TypeMapping.scala b/src/cntk-train/src/main/scala/TypeMapping.scala new file mode 100644 index 0000000000..2b4f55f92d --- /dev/null +++ b/src/cntk-train/src/main/scala/TypeMapping.scala @@ -0,0 +1,41 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
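+
+// A sketch of the intended two-stage split described below, with a hypothetical
+// concrete reducer (DoubleReducer is illustrative only, not defined in this module):
+//
+//   class DoubleReducer extends SingleTypeReducer(DoubleType) { /* transform impl */ }
+//   // its conversionMap sends every entry of TypeMapping.mmlTypes to DoubleType,
+//   // e.g. BooleanType -> DoubleType, ..., StringType -> DoubleType
+//
+// Stage 2 (the VectorAssembler variants) then decides how the reduced columns are
+// packed into one vector (SingleVectorAssembler) or several (MultiVectorAssembler).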
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.types._ +import org.apache.spark.ml._ + +object TypeMapping { + val mmlTypes = Seq[DataType]( + BooleanType, + ByteType, + ShortType, + IntegerType, + LongType, + FloatType, + DoubleType, + StringType) +} + +trait TypeConversion { + def conversionMap: Map[DataType, DataType] +} + +// This is the root of Featurize stage 1: type mapping, where stage 2 is assembly tactic +// There is one problem I cannot resolve: Can type mapping be dependent on assembly strategy? +// If so, the lines are a bit blurry and it's likely not going to a 2 stage pipeline, but +// rather a single estimator configurable (params) by an ITypeMapping and IAssemblyStrategy, to use +// C# terminology for clarity. +abstract class SingleTypeReducer(target: DataType) extends Transformer with TypeConversion { + private lazy val map = TypeMapping.mmlTypes.map(t => t -> target).toMap + def conversionMap: Map[DataType, DataType] = map +} + +abstract class VectorAssembler() + +class SingleVectorAssembler() extends VectorAssembler +class MultiVectorAssembler extends VectorAssembler +object MultiVectorAssembler { + def create(groups: Map[String, Seq[Int]]): Unit = {} +} diff --git a/src/cntk-train/src/test/scala/ValidateCntkTrain.scala b/src/cntk-train/src/test/scala/ValidateCntkTrain.scala new file mode 100644 index 0000000000..07a20ad989 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateCntkTrain.scala @@ -0,0 +1,267 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.net.URI + +import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexerModel} +import com.microsoft.ml.spark.Readers.implicits._ +import FileUtilities._ +import org.scalatest.{BeforeAndAfterEach, Suite} + +trait TestFileCleanup extends BeforeAndAfterEach { + this: Suite => + var cleanupPath: File + override def afterEach(): Unit = { + try super.afterEach() // To be stackable, must call super.afterEach + finally { + if (cleanupPath.exists) { + FileUtilities.delTree(cleanupPath) + () + } + } + } +} + +class ValidateCntkTrain extends TestBase with TestFileCleanup { + + override var cleanupPath: File = new File(new URI(dir)) + + import session.implicits._ + + val dummyTrainScript = s""" +command = trainNetwork:testNetwork + +precision = "float"; traceLevel = 1 ; deviceId = "auto" + +rootDir = ".." ; dataDir = "$$rootDir$$/DataSets/MNIST" ; +outputDir = "./Output" ; + +modelPath = "$$outputDir$$/Models/01_OneHidden" + +# TRAINING CONFIG +trainNetwork = { + action = "train" + + BrainScriptNetworkBuilder = { + labelDim = 1 # number of distinct labels + + # This model returns multiple nodes as a record, which + # can be accessed using .x syntax. 
+ model(x) = { + h1 = DenseLayer {5, activation=ReLU} (x) + z = LinearLayer {labelDim} (h1) + } + + # inputs + features = Input {9} + labels = Input {labelDim} + + # apply model to features + out = model (features) + + # loss and error computation + ce = CrossEntropyWithSoftmax (labels, out.z) + errs = ClassificationError (labels, out.z) + + # declare special nodes + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) + outputNodes = (out.z) + } + + SGD = { + epochSize = 60 + minibatchSize = 6 + maxEpochs = 3 + learningRatesPerSample = 0.0001 + momentumAsTimeConstant = 0 + + numMBsToShowResult = 500 + } + + reader = { + readerType = "CNTKTextFormatReader" + # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt). + file = "file:///Train-28x28_cntk_text.txt" + input = { + features = { dim = 784 ; format = "dense" } + labels = { dim = 10 ; format = "dense" } + } + } +} + +# TEST CONFIG +testNetwork = { + action = "test" + minibatchSize = 1024 # reduce this if you run out of memory + + reader = { + readerType = "CNTKTextFormatReader" + file = "file:///Test-28x28_cntk_text.txt" + input = { + features = { dim = 784 ; format = "dense" } + labels = { dim = 10 ; format = "dense" } + } + } +} +""" + + val cifarScript = s""" +# ConvNet applied on CIFAR-10 dataset, with no data augmentation. + +command = TrainNetwork + +precision = "float"; traceLevel = 0 ; deviceId = "auto" + +rootDir = "../../.." ; dataDir = "$$rootDir$$/DataSets/CIFAR-10" ; +outputDir = "./Output" ; + +TrainNetwork = { + action = "train" + + BrainScriptNetworkBuilder = { + imageShape = 32:32:3 + labelDim = 6 + + featMean = 128 + featScale = 1/256 + Normalize{m,f} = x => f .* (x - m) + + model = Sequential ( + Normalize {featMean, featScale} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + MaxPoolingLayer {(3:3), stride = (2:2)} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + MaxPoolingLayer {(3:3), stride = (2:2)} : + DenseLayer {256} : ReLU : Dropout : + DenseLayer {128} : ReLU : Dropout : + LinearLayer {labelDim} + ) + + # inputs + features = Input {imageShape} + labels = Input {labelDim} + + # apply model to features + z = model (features) + + # connect to system + ce = CrossEntropyWithSoftmax (labels, z) + errs = ClassificationError (labels, z) + top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action + + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) # top5Errs only used in Eval + outputNodes = (z) + } + + SGD = { + epochSize = 0 + minibatchSize = 256 + + learningRatesPerSample = 0.0015625*10:0.00046875*10:0.00015625 + momentumAsTimeConstant = 0*20:607.44 + maxEpochs = 30 + L2RegWeight = 0.002 + dropoutRate = 0*5:0.5 + + numMBsToShowResult = 100 + parallelTrain = { + parallelizationMethod = "DataParallelSGD" + parallelizationStartEpoch = 2 # warm start: don't use 1-bit SGD for first epoch + distributedMBReading = true + dataParallelSGD = { gradientBits = 1 } + } + } + + reader = { + readerType = "CNTKTextFormatReader" + file = "$$DataDir$$/Train_cntk_text.txt" + randomize = true + keepDataInMemory = true # cache all data in memory + input = { + features = { dim = 3072 ; format = "dense" } + labels = { dim = 6 ; format = "dense" } + } + } +} +""" + + test("Smoke test for training on a classifier") { + val rawPath = new 
File(s"${sys.env("DATASETS_HOME")}/Binary/Train", "breast-cancer.train.csv").toString + val path = normalizePath(rawPath) + val dataset = session.read + .option("header", true) + .option("inferSchema", true) + .option("nullValue", "?") + .csv(path) + .withColumnRenamed("Label", "labels") + + val learner = new CNTKLearner() + .setBrainScriptText(dummyTrainScript) + .setParallelTrain(false) + .setWorkingDirectory(dir) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = learner.fit(trainData) + println(model) + } + + // TODO: Redo this test with the proper image sizes now that full CIFAR is our dataset collection + // Also make this an E2E test and reduce validation scope down to smaller chunks. + test("train and eval CIFAR") { + val trigger = session.sparkContext + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + + val inputCol = "cntk_images" + val tmpLabel = "labelscol" + val indexedLabel = "idxlabels" + val labelCol = "labels" + + val images = session.readImages(imagePath, true) + + // Label annotation: CIFAR is constructed here as + // 01234-01.png, meaning (len - 5, len - 3) is label + val pathLen = images.first.getStruct(0).getString(0).length + val labeledData = images.withColumn(tmpLabel, images("image.path").substr(pathLen - 5, 2).cast("float")) + + // Unroll images into Spark representation + val unroller = new UnrollImage().setOutputCol(inputCol).setInputCol("image") + val unrolled = unroller.transform(labeledData).select(inputCol, tmpLabel) + + // Prepare Spark-like DF with known labels + + val ohe = new OneHotEncoder().setInputCol(tmpLabel).setOutputCol(labelCol).setDropLast(false) + val dataset = ohe.transform(unrolled).select(inputCol, labelCol) + + //dataset.printSchema() + //dataset.show() + + val learner = new CNTKLearner() + .setBrainScriptText(cifarScript) + // Build machine doesn't have GPUs + .setParallelTrain(false) + .setWorkingDirectory(dir) + + val model = learner.fit(dataset) + .setInputCol(inputCol) + .setOutputCol("out_labels") + .setOutputNodeIndex(3) + + val result = model.transform(dataset) + result.take(1) + //result.show() + } +} diff --git a/src/cntk-train/src/test/scala/ValidateConfiguration.scala b/src/cntk-train/src/test/scala/ValidateConfiguration.scala new file mode 100644 index 0000000000..1a87d035e4 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateConfiguration.scala @@ -0,0 +1,28 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
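+
+// The override config assembled by BrainScriptBuilder.toOverrideConfig is only printed
+// here for now; roughly, it contains lines of the form:
+//
+//   command = trainNetwork
+//   precision=float
+//   traceLevel=1
+//   deviceId="auto"
+//   rootDir="..."
+//   outputDir="out"
+//   modelPath="out/Models/ModelOut"
+//   trainNetwork = [ reader = [ readerType = CNTKTextFormatReader ; file = "in.txt" ; input = [ ... ] ] ]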
+ +package com.microsoft.ml.spark + +class ValidateConfiguration extends TestBase { + + test("Basic BrainScript config E2E") { + val relativeOutRoot = "out" + val remappedInPath = "in.txt" + val dataFormat = "text" + + val config = new BrainScriptBuilder() + .setOutputRoot(relativeOutRoot) + .setInputFile( + remappedInPath, + dataFormat, + Map("features" -> InputShape(10000, "sparse"), + "labels" -> InputShape(1, "dense"))) + + val cb = new CNTKCommandBuilder(false) + .appendOverrideConfig(config.toOverrideConfig) + + // TODO: add assertions to really validate instead + println(cb.buildCommand) + } + +} diff --git a/src/cntk-train/src/test/scala/ValidateDataConversion.scala b/src/cntk-train/src/test/scala/ValidateDataConversion.scala new file mode 100644 index 0000000000..d18c2c13f5 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateDataConversion.scala @@ -0,0 +1,83 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.linalg._ + +import FileUtilities._ + +class ValidateDataConversion extends TestBase { + + import session.implicits._ + + test("vector to text") { + val testVectors = List( + new DenseVector(Array(1.0, 0.0)), + new SparseVector( 1, Array(0), Array(8.0)), + new SparseVector( 100, Array(0,10,18,33,62,67,80), Array(1.0,2.0,1.0,1.0,1.0,1.0,1.0)), + new SparseVector(100000, Array(5833,9467,16680,29018,68900,85762,97510), Array(1.0,1.0,1.0,1.0,1.0,1.0,2.0)) + ) + + val expected = Seq( + "1.0 0.0 ", + "0:8.0 ", + "0:1.0 10:2.0 18:1.0 33:1.0 62:1.0 67:1.0 80:1.0 ", + "5833:1.0 9467:1.0 16680:1.0 29018:1.0 68900:1.0 85762:1.0 97510:2.0 ") + + val outputs = testVectors.map(DataTransferUtils.convertVectorToText) + assert(outputs === expected) + } + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Checkpoint the data") { + val data = createMockDataset + + val rData = DataTransferUtils.reduceAndAssemble(data, mockLabelColumn, "feats", "double", 10) + val cdata = DataTransferUtils.convertDatasetToCNTKTextFormat(rData, mockLabelColumn, "feats") + + val transfer = new LocalWriter(s"$dir/smoke") + val path = transfer.checkpointToText(cdata) + + val out = session.read.text(path) + + assert(verifyResult(cdata, out)) + } + + test("Verify vector labels") { + val data = createMockDataset + val rData1 = DataTransferUtils.reduceAndAssemble(data, mockLabelColumn, "feats", "double", 10) + val rData = DataTransferUtils.reduceAndAssemble(rData1, "feats", "labels", "double", 10) + val cdata = DataTransferUtils.convertDatasetToCNTKTextFormat(rData, "labels", "feats") + + val transfer = new LocalWriter(s"$dir/vectorlabel") + val path = transfer.checkpointToText(cdata) + + val out = session.read.text(path) + + assert(verifyResult(cdata, out)) + } + +} diff --git a/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala 
b/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala new file mode 100644 index 0000000000..37cc52a4b3 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala @@ -0,0 +1,14 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +class ValidateEnvironmentUtils extends TestBase { + + // This is more of a run harness as asserting this is obviously dumb + ignore("Test env features") { + println(EnvironmentUtils.IsWindows) + println(EnvironmentUtils.GPUCount) + } + +} diff --git a/src/codegen/build.sbt b/src/codegen/build.sbt new file mode 100644 index 0000000000..3e904ff0e0 --- /dev/null +++ b/src/codegen/build.sbt @@ -0,0 +1,12 @@ +//> DependsOn: core + +Extras.noJar + +// Running this project will load all jars, which will fail if they're +// all "provided". This magic makes it as if the "provided" is not +// there for the run task. See https://github.com/sbt/sbt-assembly and +// http://stackoverflow.com/questions/18838944/ +run in Compile := + Defaults + .runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) + .evaluated diff --git a/src/codegen/src/main/scala/CodeGen.scala b/src/codegen/src/main/scala/CodeGen.scala new file mode 100644 index 0000000000..075a8e3d27 --- /dev/null +++ b/src/codegen/src/main/scala/CodeGen.scala @@ -0,0 +1,79 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +import scala.util.matching.Regex +import java.util.regex.Pattern + +object CodeGen { + + def copyAllFiles(fromDir: File, rx: Regex, toDir: File): Unit = { + if (!fromDir.isDirectory) { println(s"'$fromDir' is not a directory"); return } + allFiles(fromDir, if (rx == null) null else (f => rx.findFirstIn(f.getName) != None)) + .foreach{x => copyFile(x, toDir, overwrite=true)} + } + def copyAllFiles(fromDir: File, extension: String, toDir: File): Unit = + copyAllFiles(fromDir, + if (extension == null || extension == "") null + else (Pattern.quote("." 
+ extension) + "$").r, + toDir) + + def copyAllFilesFromRoots(fromDir: File, roots: List[String], relPath: String, + extension: String, toDir: File): Unit = { + roots.foreach { root => + val dir = new File(new File(fromDir, root), relPath) + if (dir.exists && dir.isDirectory) copyAllFiles(dir, extension, toDir) + } + } + def copyAllFilesFromRoots(fromDir: File, roots: List[String], relPath: String, + rx: Regex, toDir: File): Unit = { + roots.foreach { root => + val dir = new File(new File(fromDir, root), relPath) + if (dir.exists && dir.isDirectory) copyAllFiles(dir, rx, toDir) + } + } + + def generateArtifacts(): Unit = { + println(s"""|Running registration with config: + | topDir: $topDir + | srcDir: $srcDir + | outputDir: $outputDir + | toZipDir: $toZipDir + | pyTestDir: $pyTestDir""".stripMargin) + val roots = // note: excludes the toplevel project + if (!rootsFile.exists) sys.error(s"Could not find roots file at $rootsFile") + else readFile(rootsFile, _.getLines.toList).filter(_ != ".") + println("Creating temp folders") + toZipDir.mkdirs + pyTestDir.mkdirs + println("Copy jar files to output directory") + copyAllFilesFromRoots(srcDir, roots, jarRelPath, + (Pattern.quote("-" + mmlVer + ".jar") + "$").r, + outputDir) + println("Copy source python files") + copyAllFilesFromRoots(srcDir, roots, pyRelPath, "py", toZipDir) + println("Generate python APIs") + PySparkWrapperGenerator() + // build init file + val importStrings = + (copyrightLines.mkString("\n") + "\n\n") +: + allFiles(toZipDir, _.getName.endsWith(".py")) + .filter(f => !f.getName.startsWith(internalPrefix)) + .map(f => s"from mmlspark.${f.getName.dropRight(3)} import *\n") + writeFile(new File(toZipDir, "__init__.py"), importStrings.mkString("")) + // package python zip file + zipFolder(toZipDir, zipFile) + // leave the source files there so they will be included in the super-jar + // if (!delTree(toZipDir)) println(s"Error: could not delete $toZipDir") + } + + def main(args: Array[String]): Unit = { + org.apache.log4j.BasicConfigurator.configure(new org.apache.log4j.varia.NullAppender()) + generateArtifacts() + } + +} diff --git a/src/codegen/src/main/scala/Config.scala b/src/codegen/src/main/scala/Config.scala new file mode 100644 index 0000000000..e6dacd9625 --- /dev/null +++ b/src/codegen/src/main/scala/Config.scala @@ -0,0 +1,29 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import com.microsoft.ml.spark.FileUtilities._ +import sys.process.Process + +object Config { + + val srcDir = new File(".").getCanonicalFile() + val topDir = new File("..").getCanonicalFile() + val rootsFile = new File(srcDir, "project/project-roots.txt") + val outputDir = new File(topDir, "BuildArtifacts/sdk") + val toZipDir = new File(srcDir, "src/main/resources/mmlspark") + val zipFile = new File(outputDir, "mmlspark.zip") + val pyTestDir = new File(topDir, "TestResults/generated_pytests") + val jarRelPath = "target/scala-" + sys.env("SCALA_VERSION") + val pyRelPath = "src/main/python" + val mmlVer = sys.env.getOrElse("MML_VERSION", + Process("../tools/runme/show-version").!!.trim) + val debugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" + val internalPrefix = "_" + + val copyrightLines = + Seq("# Copyright (C) Microsoft Corporation. All rights reserved.", + "# Licensed under the MIT License. 
See LICENSE in the project root for information.") + +} diff --git a/src/codegen/src/main/scala/PySparkWrapper.scala b/src/codegen/src/main/scala/PySparkWrapper.scala new file mode 100644 index 0000000000..406bc7218b --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapper.scala @@ -0,0 +1,345 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import scala.collection.mutable.ListBuffer +import scala.tools.nsc.util.DocStrings + +import org.apache.commons.lang3.StringUtils +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.Param + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +/** + * :: DeveloperApi :: + * Abstraction for PySpark wrapper generators. + */ +abstract class PySparkWrapper(entryPoint: PipelineStage, + entryPointName: String, + entryPointQualifiedName: String) { + + private val ScopeDepth = " " + private val additionalImports = Map( + ("complexTypes", + s"from ${toZipDir.getName}.TypeConversionUtils import generateTypeConverter, complexTypeConverter"), + ("utils", s"from ${toZipDir.getName}.Utils import *") + ) + + def toPySpark(): String = { + val output = new StringBuilder() + "" + } + + protected val classTemplate = Seq( + copyrightLines.mkString("\n"), + "", + "import sys", + "if sys.version >= '3':", + " basestring = str", + "", + "from pyspark.ml.param.shared import *", + "from pyspark import keyword_only", + "from pyspark.ml.util import JavaMLReadable, JavaMLWritable", + "from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel", + "from pyspark.ml.common import inherit_doc", + "%1$s", + "","", + "@inherit_doc", + "class %2$s(%3$s):", + " \"\"\"", + " %9$s", + "%11$s", + " \"\"\"", + "", + " @keyword_only", + " def __init__(self, %4$s):", + " super(%2$s, self).__init__()", + " self._java_obj = self._new_java_obj(\"%5$s\")", + "%6$s", + // since 2.1.1, kwargs is an instance attribute... + " if hasattr(self, \"_input_kwargs\"):", + " kwargs = self._input_kwargs", + " else:", + // ... so this can be removed when we drop support for 2.1.0 + " kwargs = self.__init__._input_kwargs", + " self.setParams(**kwargs)", + "", + " @keyword_only", + " def setParams(self, %4$s):", + " \"\"\"", + " Set the (keyword only) parameters","", + "%10$s", + " \"\"\"", + " if hasattr(self, \"_input_kwargs\"):", + " kwargs = self._input_kwargs", + " else:", + // ... 
same here: remove when we drop support for 2.1.0 + " kwargs = self.setParams._input_kwargs", + " return self._set(**kwargs)\n" + + "%7$s", + "%8$s", + "") + + protected val defineParamsTemplate = + " self.%1$s = Param(self, \"%1$s\", \"%2$s\")" + // Complex parameters need type converters + protected val defineComplexParamsTemplate = + " self.%1$s = Param(self, \"%1$s\", \"%2$s\", %3$s)" + protected val setParamDefaultTemplate = + " self._setDefault(%1$s=%2$s)" + protected val setParamDefaultWithGuidTemplate = + " self._setDefault(%1$s=self.uid + \"%2$s\")" + protected val setTemplate = + Seq("", + " def set%1$s(self, value):", + " \"\"\"\n\n%4$s:param %3$s %5$s\n%4$s\"\"\"", + " self._set(%2$s=value)", + " return self") + protected val getTemplate = + Seq("", + " def get%1$s(self):", + " \"\"\"", + " :return: %2$s", + " :rtype: %3$s", + " \"\"\"", + " return self.getOrDefault(self.%2$s)", + "") + protected val getComplexTemplate = + Seq("", + " def get%1$s(self):", + " \"\"\"", + " :return: %2$s", + " :rtype: %3$s", + " \"\"\"", + " return self._cache[\"%2$s\"]") + protected val saveLoadTemplate = + Seq("", + " @classmethod", + " def read(cls):", + " \"\"\" Returns an MLReader instance for this class. \"\"\"", + " return JavaMMLReader(cls)", + "", + " @staticmethod", + " def getJavaPackage():", + " \"\"\" Returns package name String. \"\"\"", + " return \"%1$s\"", + "", + " @staticmethod", + " def _from_java(java_stage):", + " stage_name=%2$s.__module__", + " return from_java(java_stage, stage_name)","") + + // TODO: Get a brief description of the class from the scala and put it here. There is not a simple + // and intuitive way to do this via reflections, similar to the way that we are able to + // retrieve the parameter explanations, for example. + protected val classDocTemplate = + "This wraps the scala class %1$s\n" + + protected val paramDocTemplate = + "%3$s:param %4$s %2$s" + + val psType: String + private lazy val objectBaseClass: String = "Java" + psType + private lazy val autoInheritedClasses = Seq("JavaMLReadable", "JavaMLWritable", objectBaseClass) + // Complex types are not easily recognized by Py4j. They need special processing. 
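+  // For these, the generated wrapper registers a type converter (generateTypeConverter)
+  // on the Param and keeps the resolved Python object in self._cache, so the getter
+  // returns the cached wrapper rather than round-tripping through Py4j.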
+ private lazy val complexTypes = Set[String]( + "TransformerParam", + "TransformerArrayParam", + "EstimatorParam") + protected def isComplexType(paramType: String): Boolean = complexTypes.contains(paramType) + + protected def getParamExplanation(param: Param[_]): String = { + entryPoint.explainParam(param) + } + + protected def getPythonizedDefault(paramDefault: String, paramType: String, + defaultStringIsParsable: Boolean): String = + paramType match { + case "BooleanParam" => + StringUtils.capitalize(paramDefault) + case "DoubleParam" | "FloatParam" | "IntParam" | "LongParam" => + paramDefault + case x if x == "Param" || defaultStringIsParsable => + "\"" + paramDefault + "\"" + case _ => + "None" + } + + protected def getPythonizedDataType(paramType: String): String = + paramType match { + case "BooleanParam" => "bool" + case "IntParam" => "int" + case "LongParam" => "long" + case "FloatParam" => "float" + case "DoubleParam" => "double" + case "StringParam" => "str" + case "Param" => "str" + case "StringArrayParam" => "list of str" + case "MapArrayParam" => "dict of str to list of str" + case _ => "object" + } + + protected def getParamDefault(param: Param[_]): (String, String) = { + var paramDefault: String = null + var pyParamDefault: String = "None" + var autogenSuffix: String = null + var defaultStringIsParsable: Boolean = true + + if (entryPoint.hasDefault(param)) { + val paramParent: String = param.parent + paramDefault = entryPoint.getDefault(param).get.toString + if (paramDefault.toLowerCase.contains(paramParent.toLowerCase)) + autogenSuffix = paramDefault.substring(paramDefault.lastIndexOf(paramParent) + + paramParent.length) + else { + try{ + entryPoint.getParam(param.name).w(paramDefault) + } + catch{ + case e: Exception => + defaultStringIsParsable = false + } + pyParamDefault = getPythonizedDefault(paramDefault, + param.getClass.getSimpleName, defaultStringIsParsable) + } + } + (pyParamDefault, autogenSuffix) + } + + private def defineParam(param: Param[_]): String = { + defineParamsTemplate + } + + protected def getPysparkWrapperBase: String = { + // Construct relevant strings + val imports = ListBuffer[String](additionalImports("utils")) + val inheritedClasses = ListBuffer[String]() + inheritedClasses ++= autoInheritedClasses + val paramsAndDefaults = ListBuffer[String]() + val paramDefinitionsAndDefaults = ListBuffer[String]() + val paramGettersAndSetters = ListBuffer[String]() + val paramDocList = ListBuffer[String]() + val classParamDocList = ListBuffer[String]() + + // Iterate over the params to build strings + val allParams: Array[Param[_]] = entryPoint.params + // Check for complex types + if(allParams.exists(p => isComplexType(p.getClass.getSimpleName))){ + // Add special imports + imports += additionalImports("complexTypes") + // Add cache + paramDefinitionsAndDefaults += ScopeDepth * 2 + "self._cache = {}" + } + for (param <- allParams) { + val pname = param.name + val docType = getPythonizedDataType(param.getClass.getSimpleName) + paramGettersAndSetters += + setTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, docType, ScopeDepth * 2, + getParamExplanation(param)) + if(isComplexType(param.getClass.getSimpleName)){ + paramDefinitionsAndDefaults += + defineComplexParamsTemplate.format( + pname, getParamExplanation(param), + s"""generateTypeConverter("$pname", self._cache, complexTypeConverter)""") + paramGettersAndSetters += + getComplexTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, param.getClass.getSimpleName) + 
paramDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth * 2, param.getClass.getSimpleName) + classParamDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth, param.getClass.getSimpleName) + } + else{ + paramDefinitionsAndDefaults += + defineParamsTemplate.format(pname, getParamExplanation(param)) + paramGettersAndSetters += + getTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, docType) + paramDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth * 2, docType) + classParamDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth, param.getClass.getSimpleName) + } + + val (pyParamDefault, autogenSuffix) = getParamDefault(param) + paramsAndDefaults += pname + "=" + pyParamDefault + + if (pyParamDefault != "None") { + paramDefinitionsAndDefaults += setParamDefaultTemplate.format(pname, pyParamDefault) + } else if (autogenSuffix != null) { + paramDefinitionsAndDefaults += setParamDefaultWithGuidTemplate.format(pname, autogenSuffix) + } + } + + // Build strings + val importsString = imports.mkString("\n") + val inheritanceString = inheritedClasses.mkString(", ") + val classParamsString = paramsAndDefaults.mkString(", ") + val paramDefinitionsAndDefaultsString = paramDefinitionsAndDefaults.mkString("\n") + val paramGettersAndSettersString = paramGettersAndSetters.mkString("\n") + val saveLoadString = saveLoadTemplate.mkString("\n").format(entryPointQualifiedName, entryPointName) + val classDocString = classDocTemplate.format(entryPointName) + val paramDocString = paramDocList.mkString("\n") + val classParamDocString = classParamDocList.mkString("\n") + + String.format(classTemplate.mkString("\n"), importsString, entryPointName, inheritanceString, + classParamsString, entryPointQualifiedName, + paramDefinitionsAndDefaultsString, paramGettersAndSettersString, saveLoadString, + classDocString, paramDocString, classParamDocString) + "\n" + } + + def pysparkWrapperBuilder(): String = { + getPysparkWrapperBase + } + + def writeWrapperToFile(dir: File): Unit = { + writeFile(new File(dir, entryPointName + ".py"), pysparkWrapperBuilder()) + } +} + +class SparkTransformerWrapper(entryPoint: Transformer, + entryPointName: String, + entryPointQualifiedName: String) + extends PySparkWrapper(entryPoint, + entryPointName, + entryPointQualifiedName) { + + override val psType = "Transformer" +} + +class SparkEstimatorWrapper(entryPoint: Estimator[_], + entryPointName: String, + entryPointQualifiedName: String, + companionModelName: String, + companionModelQualifiedName: String) + extends PySparkWrapper(entryPoint, + entryPointName, + entryPointQualifiedName) { + + private val createModelStringTemplate = Seq( + " def _create_model(self, java_model):", + " return %1$s(java_model)", + "").mkString("\n") + + private val modelClassString = Seq( + "class %1$s(JavaModel, JavaMLWritable, JavaMLReadable):", + " \"\"\"", + " Model fitted by :class:`%2$s`.", + " This class is left empty on purpose.", + " All necessary methods are exposed through inheritance.", + " \"\"\"", + "").mkString("\n") + + override def pysparkWrapperBuilder(): String = { + Seq(super.pysparkWrapperBuilder, + createModelStringTemplate.format(companionModelName), + modelClassString.format(companionModelName, entryPointName), + saveLoadTemplate.mkString("\n").format(companionModelQualifiedName, companionModelName), + "").mkString("\n") + } + + override val psType = "Estimator" + +} diff --git 
a/src/codegen/src/main/scala/PySparkWrapperGenerator.scala b/src/codegen/src/main/scala/PySparkWrapperGenerator.scala new file mode 100644 index 0000000000..e4116aee36 --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapperGenerator.scala @@ -0,0 +1,123 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import collection.JavaConverters._ +import java.io.File +import java.lang.reflect.{Type, ParameterizedType} +import java.util.jar._ + +import scala.reflect.internal.util.ScalaClassLoader.URLClassLoader +import org.apache.spark.ml.{Estimator, Transformer} + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +import scala.language.existentials +import com.microsoft.ml.spark.InternalWrapper +import scala.reflect.runtime.universe._ + +object PySparkWrapperGenerator { + + // check if the class is annotated with InternalWrapper + private[spark] def needsInternalWrapper(myClass: Class[_]):Boolean = { + val typ: ClassSymbol = runtimeMirror(myClass.getClassLoader).classSymbol(myClass) + typ.annotations.exists(a => a.tree.tpe =:= typeOf[InternalWrapper]) + } + + private[spark] def pyWrapperName(myClass: Class[_]):String = { + val prefix = if(needsInternalWrapper(myClass)) internalPrefix else "" + prefix + myClass.getSimpleName + } + + def writeWrappersToFile(myClass: Class[_], qualifiedClassName: String): Unit = { + try { + val classInstance = myClass.newInstance() + + val (wrapper: PySparkWrapper, wrapperTests: PySparkWrapperTest) = + classInstance match { + case t: Transformer => + val className = pyWrapperName(myClass) + (new SparkTransformerWrapper(t, className, qualifiedClassName), + new SparkTransformerWrapperTest(t, className, qualifiedClassName)) + case e: Estimator[_] => + var sc = myClass + while(!Seq("Estimator", "Predictor").contains(sc.getSuperclass.getSimpleName)) { + sc = sc.getSuperclass + } + val typeArgs = sc.getGenericSuperclass.asInstanceOf[ParameterizedType] + .getActualTypeArguments + val getModelFromGenericType = (modelType: Type) => { + val modelClass = modelType.getTypeName.split("<").head + (modelClass.split("\\.").last, modelClass) + } + val (modelClass, modelQualifiedClass) = sc.getSuperclass.getSimpleName match { + case "Estimator" => + getModelFromGenericType(typeArgs.head) + case "Predictor" => + getModelFromGenericType(typeArgs(2)) + } + + val className = pyWrapperName(myClass) + (new SparkEstimatorWrapper(e, + className, + qualifiedClassName, + modelClass, + modelQualifiedClass), + new SparkEstimatorWrapperTest(e, + className, + qualifiedClassName, + modelClass, + modelQualifiedClass)) + case _ => return + } + wrapper.writeWrapperToFile(toZipDir) + wrapperTests.writeWrapperToFile(pyTestDir) + if (debugMode) println(s"Generated wrapper for class ${myClass.getSimpleName}") + } catch { + // Classes without default constructor + case ie: InstantiationException => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $ie") + // Classes with "private" modifiers on constructors + case iae: IllegalAccessException => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $iae") + // Classes that require runtime library loading + case ule: UnsatisfiedLinkError => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $ule") + case e: Exception => + println(s"Could not generate wrapper for class 
${myClass.getSimpleName}: ${e.printStackTrace}") + } + } + + def getWrappersFromJarFile(jarFilePath: String, cl2: URLClassLoader): Unit = { + val cld = new URLClassLoader(Array(new File(jarFilePath).toURI.toURL), cl2) + val jfd = new JarFile(jarFilePath) + + using(Seq(cld, jfd)) { s => + val cl = s(0).asInstanceOf[URLClassLoader] + val jarFile = s(1).asInstanceOf[JarFile] + val _ = jarFile.entries.asScala + .filter(e => e.getName.endsWith(".class")) + .map(e => e.getName.replace("/", ".").stripSuffix(".class")) + .filter(q => { + val clazz = cl.loadClass(q) + try { + clazz.getEnclosingClass == null + } catch { + case _: java.lang.NoClassDefFoundError => false + } + }) + .foreach(q => writeWrappersToFile(cl.loadClass(q), q)) + }.get + } + + def apply(): Unit = { + val jarFiles = outputDir.listFiles.filter(_.getName.endsWith(".jar")) + val jarUrls = jarFiles.map(_.toURI.toURL) + using(Seq(new URLClassLoader(jarUrls, this.getClass.getClassLoader))) { s => + jarFiles.foreach(f => getWrappersFromJarFile(f.getAbsolutePath, s(0))) + }.get + } + +} diff --git a/src/codegen/src/main/scala/PySparkWrapperTest.scala b/src/codegen/src/main/scala/PySparkWrapperTest.scala new file mode 100644 index 0000000000..54cf23ec21 --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapperTest.scala @@ -0,0 +1,275 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import org.apache.commons.lang3.StringUtils +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.Param + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +/** + * :: DeveloperApi :: + * Abstraction for PySpark wrapper generators. 
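+ * Unlike PySparkWrapper, the subclasses here emit the Python unittest files that exercise each
+ * generated wrapper: default construction via setParams, the parameter getters/setters, and,
+ * where possible, a fit or transform call on a small in-memory DataFrame.
+ * For a hypothetical wrapper named MyTransform the generated file contains, roughly,
+ * {{{
+ *   class MyTransformTest(unittest.TestCase):
+ *       def test_MyTransformAllDefaults(self):
+ *           myMyTransform = MyTransform()
+ *           myMyTransform.setParams(inputCol=None)
+ *           self.assertNotEqual(myMyTransform, None)
+ * }}}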
+ */ +abstract class PySparkWrapperTest(entryPoint: PipelineStage, + entryPointName: String, + entryPointQualifiedName: String) { +
+ // general classes are imported from the mmlspark package directly; + // internal classes have to be imported from their packages + private def importClass(entryPointName:String):String = { + if(entryPointName startsWith internalPrefix) s"from mmlspark.$entryPointName import $entryPointName" + else s"from mmlspark import $entryPointName" + } +
+ protected def classTemplate(classParams: String, paramGettersAndSetters: String) = + ( + s"""| + | + |import unittest + |import pandas as pd + |import numpy as np + |import pyspark.ml, pyspark.ml.feature + |from pyspark import SparkContext + |from pyspark.sql import SQLContext + |from pyspark.ml.classification import LogisticRegression + |from pyspark.ml.regression import LinearRegression + |""" + importClass(entryPointName) + + s""" + |from pyspark.ml.feature import Tokenizer + |from mmlspark import TrainClassifier + | + |sc = SparkContext() + | + |class ${entryPointName}Test(unittest.TestCase): + | + | def test_${entryPointName}AllDefaults(self): + | my$entryPointName = $entryPointName() + | my$entryPointName.setParams($classParams) + | self.assertNotEqual(my$entryPointName, None) + | + |$paramGettersAndSetters + | + |""").stripMargin +
+ protected val unittestString = + s"""| + |import os, xmlrunner + |if __name__ == "__main__": + | result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults")), + | failfast=False, buffer=False, catchbreak=False) + |""".stripMargin +
+ protected def setAndGetTemplate(paramName: String, value: String) = + s"""| def test_set$paramName(self): + | my$entryPointName = $entryPointName() + | val = $value + | my$entryPointName.set$paramName(val) + | retVal = my$entryPointName.get$paramName() + | self.assertEqual(val, retVal) + |""".stripMargin +
+ protected def tryFitSetupTemplate(entryPointName: String) = + s"""| def test_$entryPointName(self): + | dog = "dog" + | cat = "cat" + | bird = "bird" + | tmp1 = { + | "col1": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + | "col2": [2, 3, 4, 5, 1, 3, 3, 4, 0, 2, 3, 4], + | "col3": [0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12], + | "col4": [0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34], + | "col5": [dog, cat, dog, cat, dog, bird, dog, cat, dog, bird, dog, cat], + | "col6": [cat, dog, bird, dog, bird, dog, cat, dog, cat, dog, bird, dog], + | "image": [cat, dog, bird, dog, bird, dog, cat, dog, cat, dog, bird, dog] + | } + | sqlC = SQLContext(sc) + | pddf = pd.DataFrame(tmp1) + | pddf["col1"] = pddf["col1"].astype(np.float64) + | pddf["col2"] = pddf["col2"].astype(np.int32) + | data = sqlC.createDataFrame(pddf) + |""".stripMargin +
+ protected def tryTransformTemplate(entryPointName: String, param: String) = + s"""| my$entryPointName = $entryPointName($param) + | prediction = my$entryPointName.transform(data) + | self.assertNotEqual(prediction, None) + |""".stripMargin +
+ protected def tryFitTemplate(entryPointName: String, model: String) = + s"""| my$entryPointName = $entryPointName(model=$model, labelCol="col1", numFeatures=5) + | model = my$entryPointName.fit(data) + | self.assertNotEqual(model, None)""".stripMargin +
+ private def evaluateSetupTemplate(entryPointName: String) = + s"""| def test_$entryPointName(self): + | data = { + | "labelColumn": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + | "col1": [2, 3, 4, 5, 1, 3, 3, 4, 0, 2, 3, 4], + | "col2": [0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12], + | "col3": [0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34], + | "col4": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3] + | } + | sqlC = SQLContext(sc) + | pddf = pd.DataFrame(data) + | data = sqlC.createDataFrame(pddf) + | model = TrainClassifier(model=LogisticRegression(), labelCol="labelColumn", + | numFeatures=256).fit(data) + |""".stripMargin +
+ protected def computeStatisticsTemplate(entryPointName: String) = + s"""|${evaluateSetupTemplate(entryPointName)} + | scoredData = model.transform(data) + | scoredData.limit(10).toPandas() + | evaluatedData = $entryPointName().transform(scoredData) + | self.assertNotEqual(evaluatedData, None) + |""".stripMargin +
+ protected def evaluateTemplate(entryPointName: String) = + s"""|${evaluateSetupTemplate(entryPointName)} + | model = TrainClassifier(model=LogisticRegression(), labelCol="labelColumn", + | numFeatures=256).fit(data) + | evaluateModels = FindBestModel(models=[model, model]).fit(data) + | bestModel = evaluateModels.transform(data) + | self.assertNotEqual(bestModel, None) + |""".stripMargin +
+ // These params need custom handling. For now, just skip them so we have tests that pass. + private lazy val skippedParams = Set[String]("models") + protected def isSkippedParam(paramName: String): Boolean = skippedParams.contains(paramName) + protected def isModel(paramName: String): Boolean = paramName.toLowerCase() == "model" + protected def isBaseTransformer(paramName: String): Boolean = paramName.toLowerCase() == "basetransformer" + protected def tryFitString(entryPointName: String): String = + if (entryPointName.contains("Regressor")) + tryFitTemplate(entryPointName, "LinearRegression(solver=\"l-bfgs\")") + else if (entryPointName.contains("Classifier")) + tryFitTemplate(entryPointName, "LogisticRegression()") + else "" + protected def computeStatisticsString(entryPointName: String): String = computeStatisticsTemplate(entryPointName) + protected def evaluateString(entryPointName: String): String = evaluateTemplate(entryPointName) + protected def tryTransformString(entryPointName: String): String = { + val param: String = + entryPointName match { + case "WriteBlob" => "blobPath=\"file:///tmp/" + java.util.UUID.randomUUID + ".tsv\"" + case "MultiColumnAdapter" => + "baseTransformer=Tokenizer(), inputCols = \"col5,col6\", outputCols = \"output1,output2\"\n " + case "DataConversion" => "col=\"col1\", convertTo=\"double\"" + case "FastVectorAssembler" => "inputCols=\"col1\"" + case "MultiNGram" => "inputColumns=np.array([ \"col5\", \"col6\" ])" + case "SelectColumns" => "cols=[\"col1\"]" + case "ImageFeaturizer" => "modelSaveDir=\"file:///tmp\"" + case "Repartition" => "n=2" + case "_CNTKModel" | "MultiTokenizer" | "NltTokenizeTransform" | "TextTransform" + | "TextNormalizerTransform" | "WordTokenizeTransform" => "inputCol=\"col5\"" + case _ => "" + } + tryTransformTemplate(entryPointName, param) + } +
+ protected def getPythonizedDefault(paramDefault: String, paramType: String, + defaultStringIsParsable: Boolean): String = + paramType match { + case "BooleanParam" => + StringUtils.capitalize(paramDefault) + case "DoubleParam" | "FloatParam" | "IntParam" | "LongParam" => + paramDefault + case x if x == "Param" || defaultStringIsParsable => + "\"" + paramDefault + "\"" + case _ => + "None" + } +
+ protected def getParamDefault(param: Param[_]): (String, String) = { + if (!entryPoint.hasDefault(param)) ("None", null) + else { + val paramParent: String = param.parent + val
paramDefault = entryPoint.getDefault(param).get.toString + if (paramDefault.toLowerCase.contains(paramParent.toLowerCase)) + ("None", + paramDefault.substring(paramDefault.lastIndexOf(paramParent) + paramParent.length)) + else { + val defaultStringIsParsable: Boolean = + try { + entryPoint.getParam(param.name).w(paramDefault) + true + } catch { + case e: Exception => false + } + (getPythonizedDefault(paramDefault, param.getClass.getSimpleName, defaultStringIsParsable), + null) + } + } + } + + protected def getPysparkWrapperTestBase: String = { + // Iterate over the params to build strings + val paramGettersAndSettersString = + entryPoint.params.filter { param => !isSkippedParam(param.name) + }.map { param => + val value = if (isModel(param.name)) "LogisticRegression()" + else if (isBaseTransformer(param.name)) "Tokenizer()" + else getParamDefault(param)._1 + setAndGetTemplate(StringUtils.capitalize(param.name), value) + }.mkString("\n") + val classParamsString = + entryPoint.params.map(param => param.name + "=" + getParamDefault(param)._1).mkString(", ") + classTemplate(classParamsString, paramGettersAndSettersString) + } + + def pysparkWrapperTestBuilder(): String = { + copyrightLines.mkString("\n") + getPysparkWrapperTestBase + } + + def writeWrapperToFile(dir: File): Unit = { + writeFile(new File(dir, entryPointName + "_tests.py"), pysparkWrapperTestBuilder()) + } + +} + +class SparkTransformerWrapperTest(entryPoint: Transformer, + entryPointName: String, + entryPointQualifiedName: String) + extends PySparkWrapperTest(entryPoint, + entryPointName, + entryPointQualifiedName) { + + // The transformer tests for FastVectorAssembler ... UnrollImage are disabled for the moment. + override def pysparkWrapperTestBuilder(): String = { + val transformTest = + entryPointName match { + case "ComputeModelStatistics" => computeStatisticsString(entryPointName) + case "ComputePerInstanceStatistics" => computeStatisticsString(entryPointName) + case "_CNTKModel" | "FastVectorAssembler" | "MultiNGram" | "ImageFeaturizer" + | "_ImageTransformer" | "UnrollImage" | "HashTransform" | "StopWordsRemoverTransform" + => "" + case _ => + tryFitSetupTemplate(entryPointName) + tryTransformString(entryPointName) + } + super.pysparkWrapperTestBuilder + transformTest + unittestString + } + +} + +class SparkEstimatorWrapperTest(entryPoint: Estimator[_], + entryPointName: String, + entryPointQualifiedName: String, + companionModelName: String, + companionModelQualifiedName: String) + extends PySparkWrapperTest(entryPoint, entryPointName, entryPointQualifiedName) { + + private val modelName = entryPointName + "Model" + + override def pysparkWrapperTestBuilder(): String = { + val testString = + if (entryPointName == "FindBestModel") + evaluateString(entryPointName) + else + tryFitSetupTemplate(entryPointName) + tryFitString(entryPointName) + super.pysparkWrapperTestBuilder + testString + unittestString + } + +} diff --git a/src/compute-model-statistics/build.sbt b/src/compute-model-statistics/build.sbt new file mode 100644 index 0000000000..1ddf71d75d --- /dev/null +++ b/src/compute-model-statistics/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: train-regressor +//> DependsOn: train-classifier diff --git a/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala b/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala new file mode 100644 index 0000000000..7d958abea8 --- /dev/null +++ b/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala @@ -0,0 +1,559 
@@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.contracts.MetricData +import com.microsoft.ml.spark.schema.SchemaConstants._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics, RegressionMetrics} +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.mllib.linalg.{Matrices, Matrix} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.log4j.Logger + +/** + * Contains constants used by Compute Model Statistics. + */ +object ComputeModelStatistics extends DefaultParamsReadable[ComputeModelStatistics] { + // Regression metrics + val MseSparkMetric = "mse" + val RmseSparkMetric = "rmse" + val R2SparkMetric = "r2" + val MaeSparkMetric = "mae" + + // Binary Classification metrics + val AreaUnderROCMetric = "areaUnderROC" + val AucSparkMetric = "AUC" + val AccuracySparkMetric = "accuracy" + val PrecisionSparkMetric = "precision" + val RecallSparkMetric = "recall" + val AllSparkMetrics = "all" + + // Regression column names + val MseColumnName = "mean_squared_error" + val RmseColumnName = "root_mean_squared_error" + val R2ColumnName = "R^2" + val MaeColumnName = "mean_absolute_error" + + // Binary Classification column names + val AucColumnName = "AUC" + + // Binary and Multiclass (micro-averaged) column names + val PrecisionColumnName = "precision" + val RecallColumnName = "recall" + val AccuracyColumnName = "accuracy" + + // Multiclass Classification column names + val AverageAccuracy = "average_accuracy" + val MacroAveragedRecall = "macro_averaged_recall" + val MacroAveragedPrecision = "macro_averaged_precision" + + // Metric to column name + val metricToColumnName = Map(AccuracySparkMetric -> AccuracyColumnName, + PrecisionSparkMetric -> PrecisionColumnName, + RecallSparkMetric -> RecallColumnName, + MseSparkMetric -> MseColumnName, + RmseSparkMetric -> RmseColumnName, + R2SparkMetric -> R2ColumnName, + MaeSparkMetric -> MaeColumnName) + + val classificationColumns = List(AccuracyColumnName, PrecisionColumnName, RecallColumnName) + + val regressionColumns = List(MseColumnName, RmseColumnName, R2ColumnName, MaeColumnName) + + val ClassificationEvaluationType = "Classification" + val EvaluationType = "evaluation_type" + + val FpRateROCColumnName = "false_positive_rate" + val TpRateROCColumnName = "true_positive_rate" + + val FpRateROCLog = "fpr" + val TpRateROCLog = "tpr" + + val BinningThreshold = 1000 +} + +/** + * Evaluates the given scored dataset. + */ +class ComputeModelStatistics(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("ComputeModelStatistics")) + + val evaluationMetric: Param[String] = StringParam(this, "evaluationMetric", "Metric to evaluate models with", "all") + + def getEvaluationMetric: String = $(evaluationMetric) + + /** @group setParam **/ + def setEvaluationMetric(value: String): this.type = set(evaluationMetric, value) + + lazy val logger = Logger.getLogger(this.getClass.getName) + + /** + * The ROC curve evaluated for a binary classifier. 
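+ * Populated as a side effect of transform() when the AUC metric is computed; each row is a
+ * (false_positive_rate, true_positive_rate) point taken from BinaryClassificationMetrics.roc().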
+ */ + var rocCurve: DataFrame = null + + override def transform(dataset: Dataset[_]): DataFrame = { + val (modelName, labelColumnName, scoreValueKind) = getSchemaInfo(dataset.schema) + + // For creating the result dataframe in classification or regression case + val spark = dataset.sparkSession + import spark.implicits._ + + if (scoreValueKind == SchemaConstants.ClassificationKind) { + + var resultDF: DataFrame = Seq(ComputeModelStatistics.ClassificationEvaluationType) + .toDF(ComputeModelStatistics.EvaluationType) + val scoredLabelsColumnName = SparkSchema.getScoredLabelsColumnName(dataset.schema, modelName) + + // Get levels for label column if categorical + val levels = CategoricalUtilities.getLevels(dataset.schema, labelColumnName) + + val levelsExist = levels.isDefined + + lazy val levelsToIndexMap: Map[Any, Double] = getLevelsToIndexMap(levels.get) + + lazy val predictionAndLabels = if (levelsExist) { + getPredictionAndLabels(dataset, labelColumnName, scoredLabelsColumnName, levelsToIndexMap) + } else { + selectAndCastToRDD(dataset, scoredLabelsColumnName, labelColumnName) + } + + lazy val scoresAndLabels = { + val scoresColumnName = SparkSchema.getScoresColumnName(dataset.schema, modelName) + if (scoresColumnName != null) { + if (levelsExist) { + getScoresAndLabels(dataset, labelColumnName, scoresColumnName, levelsToIndexMap) + } else { + getScalarScoresAndLabels(dataset, labelColumnName, scoresColumnName) + } + } else { + predictionAndLabels + } + } + + lazy val (labels: Array[Double], confusionMatrix: Matrix) = createConfusionMatrix(predictionAndLabels) + + // If levels exist, use the extra information they give to get better performance + getEvaluationMetric match { + case ComputeModelStatistics.AllSparkMetrics => { + resultDF = addConfusionMatrixToResult(labels, confusionMatrix, resultDF) + resultDF = addAllClassificationMetrics(modelName, + dataset, + labelColumnName, + predictionAndLabels, + confusionMatrix, + scoresAndLabels, + resultDF) + } + case simpleMetric if simpleMetric == ComputeModelStatistics.AccuracySparkMetric || + simpleMetric == ComputeModelStatistics.PrecisionSparkMetric || + simpleMetric == ComputeModelStatistics.RecallSparkMetric => { + resultDF = addSimpleMetric(simpleMetric, predictionAndLabels, resultDF) + } + case ComputeModelStatistics.AucSparkMetric => { + val numLevels = if (levelsExist) { + levels.get.length + } else { + confusionMatrix.numRows + } + if (numLevels <= 2) { + // Add the AUC + val auc: Double = getAUC(modelName, dataset, labelColumnName, scoresAndLabels) + resultDF = resultDF.withColumn(ComputeModelStatistics.AucColumnName, lit(auc)) + } else { + throw new Exception("Error: AUC is not available for multiclass case") + } + } + case default => { + throw new Exception(s"Error: $default is not a classification metric") + } + } + resultDF + } else if (scoreValueKind == SchemaConstants.RegressionKind) { + val scoresColumnName = SparkSchema.getScoresColumnName(dataset.schema, modelName) + + val scoresAndLabels = selectAndCastToRDD(dataset, scoresColumnName, labelColumnName) + + val regressionMetrics = new RegressionMetrics(scoresAndLabels) + + // get all spark metrics possible: "mse", "rmse", "r2", "mae" + val mse = regressionMetrics.meanSquaredError + val rmse = regressionMetrics.rootMeanSquaredError + val r2 = regressionMetrics.r2 + val mae = regressionMetrics.meanAbsoluteError + + logRegressionMetrics(mse, rmse, r2, mae) + + Seq((mse, rmse, r2, mae)).toDF(ComputeModelStatistics.MseColumnName, + ComputeModelStatistics.RmseColumnName, + 
ComputeModelStatistics.R2ColumnName, + ComputeModelStatistics.MaeColumnName) + } else { + throwOnInvalidScoringKind(scoreValueKind) + } + } + + private def getSchemaInfo(schema: StructType): (String, String, String) = { + // TODO: evaluate all models; for now, get first model name found + val firstModelName = schema.collectFirst { + case StructField(c, t, _, m) if (getFirstModelName(m) != null && !getFirstModelName(m).isEmpty) => { + getFirstModelName(m).get + } + } + val modelName = if (!firstModelName.isEmpty) firstModelName.get + else throw new Exception("Please score the model prior to evaluating") + val labelColumnName = SparkSchema.getLabelColumnName(schema, modelName) + + val scoreValueKind = SparkSchema.getScoreValueKind(schema, modelName, labelColumnName) + (modelName, labelColumnName, scoreValueKind) + } + + private def addSimpleMetric(simpleMetric: String, + predictionAndLabels: RDD[(Double, Double)], + resultDF: DataFrame): DataFrame = { + var newResultDF = resultDF + val (labels: Array[Double], confusionMatrix: Matrix) = createConfusionMatrix(predictionAndLabels) + // Compute metrics for binary classification + if (confusionMatrix.numCols == 2) { + val (accuracy: Double, precision: Double, recall: Double) = + getBinaryAccuracyPrecisionRecall(confusionMatrix) + // Add the metrics to the DF + if (simpleMetric == ComputeModelStatistics.AccuracySparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(accuracy)) + } else if (simpleMetric == ComputeModelStatistics.PrecisionSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.PrecisionColumnName, lit(precision)) + } else if (simpleMetric == ComputeModelStatistics.RecallSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.RecallColumnName, lit(recall)) + } + logClassificationMetrics(accuracy, precision, recall) + } else { + val (microAvgAccuracy: Double, microAvgPrecision: Double, microAvgRecall: Double, _, _, _) = + getMulticlassMetrics(predictionAndLabels, confusionMatrix) + + // Add the metrics to the DF + if (simpleMetric == ComputeModelStatistics.AccuracySparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(microAvgAccuracy)) + } else if (simpleMetric == ComputeModelStatistics.PrecisionSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.PrecisionColumnName, lit(microAvgPrecision)) + } else if (simpleMetric == ComputeModelStatistics.RecallSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.RecallColumnName, lit(microAvgRecall)) + } + logClassificationMetrics(microAvgAccuracy, microAvgPrecision, microAvgRecall) + } + newResultDF + } + + private def addAllClassificationMetrics(modelName: String, + dataset: Dataset[_], + labelColumnName: String, + predictionAndLabels: RDD[(Double, Double)], + confusionMatrix: Matrix, + scoresAndLabels: RDD[(Double, Double)], + resultDF: DataFrame): DataFrame = { + var newResultDF = resultDF + // Compute metrics for binary classification + if (confusionMatrix.numCols == 2) { + val (accuracy: Double, precision: Double, recall: Double) = getBinaryAccuracyPrecisionRecall(confusionMatrix) + // Add the metrics to the DF + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(accuracy)) + .withColumn(ComputeModelStatistics.PrecisionColumnName, lit(precision)) + .withColumn(ComputeModelStatistics.RecallColumnName, lit(recall)) + + logClassificationMetrics(accuracy, precision, recall) + + // 
Add the AUC + val auc: Double = getAUC(modelName, dataset, labelColumnName, scoresAndLabels) + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AucColumnName, lit(auc)) + + logAUC(auc) + } else { + val (microAvgAccuracy: Double, + microAvgPrecision: Double, + microAvgRecall: Double, + averageAccuracy: Double, + macroAveragedPrecision: Double, + macroAveragedRecall: Double) = getMulticlassMetrics(predictionAndLabels, confusionMatrix) + + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(microAvgAccuracy)) + .withColumn(ComputeModelStatistics.PrecisionColumnName, lit(microAvgPrecision)) + .withColumn(ComputeModelStatistics.RecallColumnName, lit(microAvgRecall)) + .withColumn(ComputeModelStatistics.AverageAccuracy, lit(averageAccuracy)) + .withColumn(ComputeModelStatistics.MacroAveragedPrecision, lit(macroAveragedPrecision)) + .withColumn(ComputeModelStatistics.MacroAveragedRecall, lit(macroAveragedRecall)) + + logClassificationMetrics(microAvgAccuracy, microAvgPrecision, microAvgRecall) + + } + newResultDF + } + + private def addConfusionMatrixToResult(labels: Array[Double], + confusionMatrix: Matrix, + resultDF: DataFrame): DataFrame = { + var resultDFModified = resultDF + for (col: Int <- 0 until confusionMatrix.numCols; + row: Int <- 0 until confusionMatrix.numRows) { + resultDFModified = resultDFModified + .withColumn(s"predicted_class_as_${labels(col).toString}_actual_is_${labels(row).toString}", + lit(confusionMatrix(row, col))) + } + resultDFModified + } + + private def selectAndCastToDF(dataset: Dataset[_], + predictionColumnName: String, + labelColumnName: String): DataFrame = { + dataset.select(col(predictionColumnName), col(labelColumnName).cast(DoubleType)) + .na + .drop(Array(predictionColumnName, labelColumnName)) + } + + private def selectAndCastToRDD(dataset: Dataset[_], + predictionColumnName: String, + labelColumnName: String): RDD[(Double, Double)] = { + selectAndCastToDF(dataset, predictionColumnName, labelColumnName) + .rdd + .map { + case Row(prediction: Double, label: Double) => (prediction, label) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getPredictionAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoredLabelsColumnName: String, + levelsToIndexMap: Map[Any, Double]): RDD[(Double, Double)] = { + // Calculate confusion matrix and output it as DataFrame + dataset.select(col(scoredLabelsColumnName), col(labelColumnName)) + .na + .drop(Array(scoredLabelsColumnName, labelColumnName)) + .rdd + .map { + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getScalarScoresAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoresColumnName: String): RDD[(Double, Double)] = { + selectAndCastToDF(dataset, scoresColumnName, labelColumnName) + .rdd + .map { + case Row(prediction: Vector, label: Double) => (prediction(1), label) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getScoresAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoresColumnName: String, + levelsToIndexMap: Map[Any, Double]): RDD[(Double, Double)] = { + dataset.select(col(scoresColumnName), col(labelColumnName)) + .na + .drop(Array(scoresColumnName, labelColumnName)) + .rdd + .map { + case Row(prediction: Vector, label) => (prediction(1), 
levelsToIndexMap(label)) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getLevelsToIndexMap(levels: Array[_]): Map[Any, Double] = { + levels.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + } + + private def getMulticlassMetrics(predictionAndLabels: RDD[(Double, Double)], + confusionMatrix: Matrix): (Double, Double, Double, Double, Double, Double) = { + // Compute multiclass metrics based on paper "A systematic analysis of performance measure for classification + // tasks", Sokolova and Lapalme + var tpSum: Double = 0.0 + for (diag: Int <- 0 until confusionMatrix.numCols) { + tpSum += confusionMatrix(diag, diag) + } + val totalSum = predictionAndLabels.count() + + val microAvgAccuracy = tpSum / totalSum + val microAvgPrecision = microAvgAccuracy + val microAvgRecall = microAvgAccuracy + + // Compute class counts - these are the row and column sums of the matrix, used to calculate the + // average accuracy, macro averaged precision and macro averaged recall + val actualClassCounts = new Array[Double](confusionMatrix.numCols) + val predictedClassCounts = new Array[Double](confusionMatrix.numRows) + val truePositives = new Array[Double](confusionMatrix.numRows) + for (rowIndex: Int <- 0 until confusionMatrix.numRows) { + for (colIndex: Int <- 0 until confusionMatrix.numCols) { + actualClassCounts(rowIndex) += confusionMatrix(rowIndex, colIndex) + predictedClassCounts(colIndex) += confusionMatrix(rowIndex, colIndex) + + if (rowIndex == colIndex) { + truePositives(rowIndex) += confusionMatrix(rowIndex, colIndex) + } + } + } + + var totalAccuracy = 0.0 + var totalPrecision = 0.0 + var totalRecall = 0.0 + for (classIndex: Int <- 0 until confusionMatrix.numCols) { + // compute the class accuracy as: + // (true positive + true negative) / total => + // (true positive + (total - (actual + predicted - true positive))) / total => + // 2 * true positive + (total - (actual + predicted)) / total + totalAccuracy += (2 * truePositives(classIndex) + + (totalSum - (actualClassCounts(classIndex) + predictedClassCounts(classIndex)))) / totalSum + + // compute the class precision as: + // true positive / predicted as positive (=> tp + fp) + totalPrecision += truePositives(classIndex) / predictedClassCounts(classIndex) + + // compute the class recall as: + // true positive / actual positive (=> tp + fn) + totalRecall += truePositives(classIndex) / actualClassCounts(classIndex) + } + + val averageAccuracy = totalAccuracy / confusionMatrix.numCols + val macroAveragedPrecision = totalPrecision / confusionMatrix.numCols + val macroAveragedRecall = totalRecall / confusionMatrix.numCols + (microAvgAccuracy, microAvgPrecision, microAvgRecall, averageAccuracy, macroAveragedPrecision, macroAveragedRecall) + } + + private def getAUC(modelName: String, + dataset: Dataset[_], + labelColumnName: String, + scoresAndLabels: RDD[(Double, Double)]): Double = { + val binaryMetrics = new BinaryClassificationMetrics(scoresAndLabels, + ComputeModelStatistics.BinningThreshold) + + val spark = dataset.sparkSession + import spark.implicits._ + + rocCurve = binaryMetrics.roc() + .toDF(ComputeModelStatistics.FpRateROCColumnName, ComputeModelStatistics.TpRateROCColumnName) + logROC(rocCurve) + val auc = binaryMetrics.areaUnderROC() + logAUC(auc) + auc + } + + private def getBinaryAccuracyPrecisionRecall(confusionMatrix: Matrix): (Double, Double, Double) = { + val TP: Double = confusionMatrix(1, 1) + val FP: Double = confusionMatrix(0, 1) + val TN: Double = 
confusionMatrix(0, 0) + val FN: Double = confusionMatrix(1, 0) + + val accuracy: Double = (TP + TN) / (TP + TN + FP + FN) + val precision: Double = TP / (TP + FP) + val recall: Double = TP / (TP + FN) + (accuracy, precision, recall) + } + + private def createConfusionMatrix(predictionAndLabels: RDD[(Double, Double)]): (Array[Double], Matrix) = { + val metrics = new MulticlassMetrics(predictionAndLabels) + var labels = metrics.labels + var confusionMatrix = metrics.confusionMatrix + + val numCols = confusionMatrix.numCols + val numRows = confusionMatrix.numRows + + // Reformat the confusion matrix if less than binary size + if (numCols < 2 && numRows < 2) { + val values = Array.ofDim[Double](2 * 2) + for (col: Int <- 0 until confusionMatrix.numCols; + row: Int <- 0 until confusionMatrix.numRows) { + // We need to interpret the actual label value + val colLabel = if (labels(col) > 0) 1 else 0 + val rowLabel = if (labels(row) > 0) 1 else 0 + values(colLabel + rowLabel * 2) = + confusionMatrix(row, col) + } + confusionMatrix = Matrices.dense(2, 2, values) + labels = Array(0, 1) + } + (labels, confusionMatrix) + } + + private def logClassificationMetrics(accuracy: Double, precision: Double, recall: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.AccuracyColumnName -> accuracy, + ComputeModelStatistics.PrecisionColumnName -> precision, + ComputeModelStatistics.RecallColumnName -> recall), "Classification Metrics", uid) + logger.info(metrics) + } + + private def logRegressionMetrics(mse: Double, rmse: Double, r2: Double, mae: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.MseColumnName -> mse, + ComputeModelStatistics.RmseColumnName -> rmse, + ComputeModelStatistics.R2ColumnName -> r2, + ComputeModelStatistics.MaeColumnName -> mae), "Regression Metrics", uid) + logger.info(metrics) + } + + private def logAUC(auc: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.AucColumnName -> auc), "AUC Metric", uid) + logger.info(metrics) + } + + private def logROC(roc: DataFrame): Unit = { + val metrics = MetricData.createTable( + Map( + ComputeModelStatistics.TpRateROCLog -> + roc.select(ComputeModelStatistics.TpRateROCColumnName) + .collect() + .map(row => row(0).asInstanceOf[Double]).toSeq, + ComputeModelStatistics.FpRateROCLog -> + roc.select(ComputeModelStatistics.FpRateROCColumnName) + .collect() + .map(row => row(0).asInstanceOf[Double]).toSeq + ), + "ROC Metric", + uid) + logger.info(metrics) + } + + private def getFirstModelName(colMetadata: Metadata): Option[String] = { + if (!colMetadata.contains(MMLTag)) null + else { + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata) + metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix)) + } + } + + override def copy(extra: ParamMap): Transformer = new ComputeModelStatistics() + + override def transformSchema(schema: StructType): StructType = { + val (_, _, scoreValueKind) = getSchemaInfo(schema) + val columns = + if (scoreValueKind == SchemaConstants.ClassificationKind) ComputeModelStatistics.classificationColumns + else if (scoreValueKind == SchemaConstants.RegressionKind) ComputeModelStatistics.regressionColumns + else throwOnInvalidScoringKind(scoreValueKind) + getTransformedSchema(columns, scoreValueKind) + + } + + private def throwOnInvalidScoringKind(scoreValueKind: String) = { + throw new Exception(s"Error: unknown scoring kind $scoreValueKind") + } + + private def 
getTransformedSchema(columns: List[String], metricType: String) = { + getEvaluationMetric match { + case ComputeModelStatistics.AllSparkMetrics => + StructType(columns.map(StructField(_, DoubleType))) + case metric: String if (ComputeModelStatistics.metricToColumnName.contains(metric)) && + columns.contains(ComputeModelStatistics.metricToColumnName(metric)) => + StructType(Array(StructField(ComputeModelStatistics.metricToColumnName(metric), DoubleType))) + case default => + throw new Exception(s"Error: $default is not a $metricType metric") + } + } +} diff --git a/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala b/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala new file mode 100644 index 0000000000..5aa538b549 --- /dev/null +++ b/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala @@ -0,0 +1,245 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.TrainRegressorTestUtilities._ +import com.microsoft.ml.spark.TrainClassifierTestUtilities._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Tests to validate the functionality of Evaluate Model module. + */ +class VerifyComputeModelStatistics extends TestBase { + + test("Smoke test for evaluating a dataset") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(Seq( + (0.0, 2, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 4, 0.78, 0.99, 2.0), + (3.0, 5, 0.12, 0.34, 3.0), + (0.0, 1, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0), + (0.0, 0, 0.50, 0.60, 0.0), + (1.0, 2, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0))) + .toDF(labelColumn, "col1", "col2", "col3", predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedSchema = new ComputeModelStatistics().transformSchema(datasetWithScores.schema) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScores) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2) == 1.0) + assert(firstRow.get(3) == 0.0) + + assert(evaluatedSchema == StructType(ComputeModelStatistics.regressionColumns.map(StructField(_, DoubleType)))) + } + + test("Evaluate a dataset with missing values") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(sc.parallelize(Seq( + (0.0, 0.0), + (0.0, null), + (1.0, 1.0), + (2.0, 2.0), + (null, null), + (0.0, 0.0), + (null, 3.0))).map(values => Row(values._1, values._2)), + 
StructType(Array(StructField(labelColumn, DoubleType, true), + StructField(predictionColumn, DoubleType, true)))) + .toDF(labelColumn, predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScores) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2) == 1.0) + assert(firstRow.get(3) == 0.0) + } + + test("Smoke test to train regressor, score and evaluate on a dataset using all three modules") { + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3) + )).toDF("labelColumn", "col1", "col2", "col3", "col4") + + val labelColumn = "someOtherColumn" + + val datasetWithAddedColumn = dataset.withColumn(labelColumn, org.apache.spark.sql.functions.lit(0.0)) + + val linearRegressor = createLinearRegressor(labelColumn) + val scoredDataset = + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, datasetWithAddedColumn, linearRegressor) + + val evaluatedData = new ComputeModelStatistics().transform(scoredDataset) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2).asInstanceOf[Double].isNaN) + assert(firstRow.get(3) == 0.0) + } + + test("Smoke test to train classifier, score and evaluate on a dataset using all three modules") { + val labelColumn = "Label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3) + )).toDF(labelColumn, "col1", "col2", "col3", "col4") + + val logisticRegressor = createLogisticRegressor(labelColumn) + val scoredDataset = TrainClassifierTestUtilities.trainScoreDataset(labelColumn, dataset, logisticRegressor) + val evaluatedData = new ComputeModelStatistics().transform(scoredDataset) + + val evaluatedSchema = new ComputeModelStatistics().transformSchema(scoredDataset.schema) + assert(evaluatedSchema == StructType(ComputeModelStatistics.classificationColumns.map(StructField(_, DoubleType)))) + } + + test("Verify results of multiclass metrics") { + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val labelsAndPrediction = session.createDataFrame( + Seq( + (0.0, 0.0), + (0.0, 0.0), + (0.0, 1.0), + (0.0, 2.0), + (1.0, 0.0), + (1.0, 1.0), + (1.0, 1.0), + (1.0, 1.0), + (2.0, 2.0))).toDF(labelColumn, predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(labelsAndPrediction, scoreModelName, labelColumn, + SchemaConstants.ClassificationKind) + val datasetWithScoredLabels = + SparkSchema.setScoredLabelsColumnName(datasetWithLabel, scoreModelName, 
predictionColumn, + SchemaConstants.ClassificationKind) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScoredLabels) + + val tp0 = 2.0 + val tp1 = 3.0 + val tp2 = 1.0 + val tn0 = 4.0 + val tn1 = 4.0 + val tn2 = 7.0 + val numLabels = 3.0 + val total = labelsAndPrediction.count() + + val precision0 = 2.0 / (2 + 1) + val precision1 = 3.0 / (3 + 1) + val precision2 = 1.0 / (1 + 1) + val recall0 = 2.0 / (2 + 2) + val recall1 = 3.0 / (3 + 1) + val recall2 = 1.0 / (1 + 0) + + val overallAccuracy = (tp0 + tp1 + tp2) / total + val evalRow = evaluatedData.first() + assert(evalRow.getAs[Double](ComputeModelStatistics.AccuracyColumnName) == overallAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.PrecisionColumnName) == overallAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.RecallColumnName) == overallAccuracy) + val avgAccuracy = ((tp0 + tn0) / total + (tp1 + tn1) / total + (tp2 + tn2) / total) / numLabels + val macroPrecision = (precision0 + precision1 + precision2) / numLabels + val macroRecall = (recall0 + recall1 + recall2) / numLabels + assert(evalRow.getAs[Double](ComputeModelStatistics.AverageAccuracy) == avgAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.MacroAveragedPrecision) == macroPrecision) + assert(evalRow.getAs[Double](ComputeModelStatistics.MacroAveragedRecall) == macroRecall) + } + + test("validate AUC from compute model statistic and binary classification evaluator gives the same result") { + val fileLocation = ClassifierTestUtils.classificationTrainFile("transfusion.csv").toString + val label = "Donated" + val dataset: DataFrame = + session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("treatEmptyValuesAsNulls", "false") + .option("delimiter", ",") + .load(fileLocation) + + val split = dataset.randomSplit(Array(0.75,0.25)) + val train = split(0) + val test = split(1) + + val trainClassifier = new TrainClassifier() + val model = trainClassifier.setModel(new LogisticRegression()) + .set(trainClassifier.labelCol, label) + .set(trainClassifier.numFeatures, 1 << 18) + .fit(train) + val scored = model.transform(test) + val eval = new ComputeModelStatistics().transform(scored) + val cmsAUC = eval.first().getAs[Double]("AUC") + + val binaryEvaluator = new BinaryClassificationEvaluator() + .setMetricName("areaUnderROC") + .setLabelCol(label) + .setRawPredictionCol(SchemaConstants.ScoresColumn) + + val levels = CategoricalUtilities.getLevels(scored.schema, label) + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + // Calculate confusion matrix and output it as DataFrame + val predictionAndLabels = session + .createDataFrame(scored.select(col(SchemaConstants.ScoresColumn), col(label)).rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + }).toDF(SchemaConstants.ScoresColumn, label) + + val auc = binaryEvaluator.evaluate(predictionAndLabels) + assert(auc === cmsAUC) + } + +} diff --git a/src/compute-per-instance-statistics/build.sbt b/src/compute-per-instance-statistics/build.sbt new file mode 100644 index 0000000000..1ddf71d75d --- /dev/null +++ b/src/compute-per-instance-statistics/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: train-regressor +//> DependsOn: train-classifier diff --git a/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala 
b/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala new file mode 100644 index 0000000000..9cb83ea1a3 --- /dev/null +++ b/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala @@ -0,0 +1,110 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.SchemaConstants._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +/** + * Contains constants used by Compute Per Instance Statistics. + */ +object ComputePerInstanceStatistics extends DefaultParamsReadable[ComputePerInstanceStatistics] { + // Regression metrics + val L1LossMetric = "L1_loss" + val L2LossMetric = "L2_loss" + + // Classification metrics + val LogLossMetric = "log_loss" + + val epsilon = 1e-15 +} + +/** + * Evaluates the given scored dataset with per instance metrics. + */ +class ComputePerInstanceStatistics(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("ComputePerInstanceStatistics")) + + override def transform(dataset: Dataset[_]): DataFrame = { + // TODO: evaluate all models; for now, get first model name found + val firstModelName = dataset.schema.collectFirst { + case StructField(c, t, _, m) if (getFirstModelName(m) != null && !getFirstModelName(m).isEmpty) => { + getFirstModelName(m).get + } + } + val modelName = if (!firstModelName.isEmpty) firstModelName.get + else throw new Exception("Please score the model prior to evaluating") + val dataframe = dataset.toDF() + val labelColumnName = SparkSchema.getLabelColumnName(dataframe, modelName) + + val scoreValueKind = SparkSchema.getScoreValueKind(dataframe, modelName, labelColumnName) + + if (scoreValueKind == SchemaConstants.ClassificationKind) { + // Compute the LogLoss for classification case + val scoredLabelsColumnName = SparkSchema.getScoredLabelsColumnName(dataframe, modelName) + + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(dataframe.schema, labelColumnName) + val numLevels = + if (!levels.isEmpty && levels.get != null) { + if (levels.get.length > 2) levels.get.length else 2 + } else { + // Otherwise compute unique levels + dataset.select(col(labelColumnName).cast(DoubleType)).rdd.distinct().count().toInt + } + + val logLossFunc = udf((scoredLabel: Double, scores: org.apache.spark.ml.linalg.Vector) => + if (scoredLabel < numLevels) { + -Math.log(Math.min(1, Math.max(ComputePerInstanceStatistics.epsilon, scores(scoredLabel.toInt)))) + } else { + // penalize if no label seen in training + -Math.log(ComputePerInstanceStatistics.epsilon) + }) + val probabilitiesColumnName = SparkSchema.getScoredProbabilitiesColumnName(dataframe, modelName) + dataframe.withColumn(ComputePerInstanceStatistics.LogLossMetric, + logLossFunc(dataset(scoredLabelsColumnName), dataset(probabilitiesColumnName))) + } else { + val scoresColumnName = SparkSchema.getScoresColumnName(dataframe, modelName) + // Compute the L1 and L2 loss for regression case + val scoresAndLabels = + dataset.select(col(scoresColumnName), col(labelColumnName).cast(DoubleType)).rdd.map { + case 
Row(prediction: Double, label: Double) => (prediction, label) + } + val l1LossFunc = udf((trueLabel:Double, scoredLabel: Double) => math.abs(trueLabel - scoredLabel)) + val l2LossFunc = udf((trueLabel:Double, scoredLabel: Double) => + { + val loss = math.abs(trueLabel - scoredLabel) + loss * loss + }) + dataframe.withColumn(ComputePerInstanceStatistics.L1LossMetric, + l1LossFunc(dataset(labelColumnName), dataset(scoresColumnName))) + .withColumn(ComputePerInstanceStatistics.L2LossMetric, + l2LossFunc(dataset(labelColumnName), dataset(scoresColumnName))) + } + } + + private def getFirstModelName(colMetadata: Metadata): Option[String] = { + if (!colMetadata.contains(MMLTag)) null + else { + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata) + metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix)) + } + } + + override def copy(extra: ParamMap): Transformer = new ComputePerInstanceStatistics() + + // TODO: This should be based on the retrieved score value kind + override def transformSchema(schema: StructType): StructType = + schema.add(new StructField(ComputePerInstanceStatistics.L1LossMetric, DoubleType)) + .add(new StructField(ComputePerInstanceStatistics.L2LossMetric, DoubleType)) + +} diff --git a/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala b/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala new file mode 100644 index 0000000000..bfacec803e --- /dev/null +++ b/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala @@ -0,0 +1,130 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.TrainRegressorTestUtilities._ +import com.microsoft.ml.spark.TrainClassifierTestUtilities._ +import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql._ + +import scala.tools.nsc.transform.patmat.Lit + +/** + * Tests to validate the functionality of Compute Per Instance Statistics module. 
+ */ +class VerifyComputePerInstanceStatistics extends TestBase { + + test("Smoke test for evaluating a dataset") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(Seq( + (0.0, 2, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 4, 0.78, 0.99, 2.0), + (3.0, 5, 0.12, 0.34, 3.0), + (0.0, 1, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0), + (0.0, 0, 0.50, 0.60, 0.0), + (1.0, 2, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0))) + .toDF(labelColumn, "col1", "col2", "col3", predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedData = new ComputePerInstanceStatistics().transform(datasetWithScores) + validatePerInstanceRegressionStatistics(evaluatedData) + } + + test("Smoke test to train regressor, score and evaluate on a dataset using all three modules") { + val label = "label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3) + )).toDF(label, "col1", "col2", "col3", "col4") + + val linearRegressor = createLinearRegressor(label) + val scoredDataset = + TrainRegressorTestUtilities.trainScoreDataset(label, dataset, linearRegressor) + + val evaluatedData = new ComputePerInstanceStatistics().transform(scoredDataset) + validatePerInstanceRegressionStatistics(evaluatedData) + } + + test("Smoke test to train classifier, score and evaluate on a dataset using all three modules") { + val labelColumn = "Label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3) + )).toDF(labelColumn, "col1", "col2", "col3", "col4") + + val logisticRegressor = createLogisticRegressor(labelColumn) + val scoredDataset = TrainClassifierTestUtilities.trainScoreDataset(labelColumn, dataset, logisticRegressor) + val evaluatedData = new ComputePerInstanceStatistics().transform(scoredDataset) + validatePerInstanceClassificationStatistics(evaluatedData) + } + + private def validatePerInstanceRegressionStatistics(evaluatedData: DataFrame): Unit = { + // Validate the per instance statistics + evaluatedData.collect().foreach(row => { + val labelUncast = row(0) + val label = + if (labelUncast.isInstanceOf[Int]) labelUncast.asInstanceOf[Int].toDouble + else labelUncast.asInstanceOf[Double] + val score = row.getDouble(row.length - 3) + val l1Loss = row.getDouble(row.length - 2) + val l2Loss = row.getDouble(row.length - 1) + val loss = math.abs(label - score) + assert(l1Loss === loss) + assert(l2Loss === loss * loss) + }) + } + + private def validatePerInstanceClassificationStatistics(evaluatedData: DataFrame): Unit = { + // Validate the per instance statistics + 
evaluatedData.collect().foreach(row => { + val labelUncast = row(0) + val label = + if (labelUncast.isInstanceOf[Int]) labelUncast.asInstanceOf[Int].toDouble + else labelUncast.asInstanceOf[Double] + val probabilities = row.get(row.length - 3).asInstanceOf[org.apache.spark.ml.linalg.Vector] + val scoredLabel = row.getDouble(row.length - 2).toInt + val logLoss = row.getDouble(row.length - 1) + val computedLogLoss = -Math.log(Math.min(1, Math.max(ComputePerInstanceStatistics.epsilon, + probabilities(scoredLabel.toInt)))) + assert(computedLogLoss === logLoss) + }) + } + +} diff --git a/src/core/build.sbt b/src/core/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/contracts/build.sbt b/src/core/contracts/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/contracts/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/contracts/src/main/scala/Exceptions.scala b/src/core/contracts/src/main/scala/Exceptions.scala new file mode 100644 index 0000000000..aa923474d2 --- /dev/null +++ b/src/core/contracts/src/main/scala/Exceptions.scala @@ -0,0 +1,35 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.contracts + +import java.lang.RuntimeException + +import org.apache.spark.ml.util.Identifiable + +object MMLException { + implicit class MMLID(val i: Identifiable) extends AnyVal { + def id: String = i.uid + } + def throwEx(msg: String, inner: Throwable = null)(implicit i: MMLID): MMLException = { + throw new MMLException(i.id, msg, inner) + } +} +import MMLException._ + +// The caller must *explicitly* pass null to the source and inner exception +class MMLException(source: String, msg: String, inner: Throwable) + // suppression = true by default + // writableStackTrace -> true by design for us + extends RuntimeException(msg, inner, true, true) { + + // Fix this to be structured for operationalized scenarios? + // Or will they consume the object? + override def toString(): String = source + super.toString +} + +class FriendlyException(addedInfo: String, inner: Throwable)(implicit aid: MMLID) + extends MMLException(aid.id, addedInfo, inner) + +class ParamException(reason: String)(implicit aid: MMLID) + extends MMLException(aid.id, reason, null) diff --git a/src/core/contracts/src/main/scala/Metrics.scala b/src/core/contracts/src/main/scala/Metrics.scala new file mode 100644 index 0000000000..da1b5aa1f2 --- /dev/null +++ b/src/core/contracts/src/main/scala/Metrics.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.contracts + +// Case class matching +sealed abstract class Metric + +// Just for clarity in the contract file +object ConvenienceTypes { + type UniqueName = String + type MetricTable = Map[UniqueName, Seq[Metric]] +} +import ConvenienceTypes._ + +// One option +case class TypedMetric[T](name: UniqueName, value: T) extends Metric +case class MetricGroup(name: UniqueName, metrics: MetricTable) { + require ({ + val len = metrics.values.head.length + metrics.values.forall(col => col.length == len) + }, s"All metric lists in the table must be the same length") +} + +// Other option (reflection friendly - do we need reflection?) 
+sealed abstract class TypenameMetric +case class DoubleMetric(name: UniqueName, value: Double) extends TypenameMetric +case class StringMetric(name: UniqueName, value: String) extends TypenameMetric +case class IntegralMetric(name: UniqueName, value: Long) extends TypenameMetric + +case class TypenameMetricGroup(name: UniqueName, values: Map[UniqueName, Seq[TypenameMetric]]) + +/** + * Defines contract for Metric table, which is a metric name to list of values. + * @param data + */ +case class MetricData(data: Map[String, Seq[Double]], metricType: String, modelName: String) + +object MetricData { + def create(data: Map[String, Double], metricType: String, modelName: String): MetricData = { + return new MetricData(data.map(kvp => (kvp._1, List(kvp._2))), metricType, modelName) + } + + def createTable(data: Map[String, Seq[Double]], metricType: String, modelName: String): MetricData = { + return new MetricData(data, metricType, modelName) + } +} diff --git a/src/core/contracts/src/main/scala/Params.scala b/src/core/contracts/src/main/scala/Params.scala new file mode 100644 index 0000000000..f0ab6e8473 --- /dev/null +++ b/src/core/contracts/src/main/scala/Params.scala @@ -0,0 +1,134 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.collection.mutable.Map +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} + +trait MMLParams extends Wrappable with DefaultParamsWritable + +trait Wrappable extends Params { + + // Use this function when instantiating sparkML Identifiable for your + // own use - it allows us to locate the origin of any stacks + def chainedUid(origin: String): String = Identifiable.randomUID(this.uid) + + private var orderCounter = 0 + // TODO: Support non-string "enums"? 
+ val paramDomains = Map[String, Seq[String]]() + + def BooleanParam(i: Identifiable, name: String, description: String): BooleanParam = + BooleanParam(i, name, description, false) + + def BooleanParam(i: Identifiable, name: String, description: String, + default: Boolean): BooleanParam = { + val baseParam = new BooleanParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String): IntParam = { + val baseParam = new IntParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String, + default: Int): IntParam = { + val baseParam = new IntParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String, validation: Int => Boolean): IntParam = { + val baseParam = new IntParam(i, name, description, validation) + MMLParam(baseParam, None, None) + baseParam + } + + def LongParam(i: Identifiable, name: String, description: String): LongParam = { + val baseParam = new LongParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def LongParam(i: Identifiable, name: String, description: String, + default: Long): LongParam = { + val baseParam = new LongParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def DoubleParam(i: Identifiable, name: String, description: String): DoubleParam = { + val baseParam = new DoubleParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def DoubleParam(i: Identifiable, name: String, description: String, + default: Double): DoubleParam = { + val baseParam = new DoubleParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, validation: String => Boolean): Param[String] = { + val baseParam = new Param[String](i, name, description, validation) + MMLParam(baseParam, None, None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, + default: String): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, + default: String, domain: Seq[String]): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, Some(default), Some(domain)) + baseParam + } + + private def MMLParam[T](param: Param[T], + default: Option[T], domain: Option[Seq[String]]): Unit = { + if (default.isDefined) setDefault(param, default.get) + if (domain.isDefined) paramDomains.put(param.name, domain.get) + orderCounter += 1 + } + +} + +trait HasInputCol extends Wrappable { + val inputCol = StringParam(this, "inputCol", "The name of the input column") + def setInputCol(value: String): this.type = set(inputCol, value) + def getInputCol: String = $(inputCol) +} + +trait HasOutputCol extends Wrappable { + val outputCol = StringParam(this, "outputCol", "The name of the output column") + def setOutputCol(value: String): this.type = set(outputCol, value) + def getOutputCol: String = $(outputCol) +} + +trait HasLabelCol extends Wrappable { + val 
labelCol = StringParam(this, "labelCol", "The name of the label column") + def setLabelCol(value: String): this.type = set(labelCol, value) + def getLabelCol: String = $(labelCol) +} + +trait HasFeaturesCol extends Wrappable { + val featuresCol = StringParam(this, "featuresCol", "The name of the features column") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + def getFeaturesCol: String = $(featuresCol) +} diff --git a/src/core/env/build.sbt b/src/core/env/build.sbt new file mode 100644 index 0000000000..cde1c89ce0 --- /dev/null +++ b/src/core/env/build.sbt @@ -0,0 +1,7 @@ +libraryDependencies ++= Seq( + // "%%" for scala things, "%" for plain java things + "com.typesafe" % "config" % "1.3.1", + "org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided", + "org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided", + "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided" + ) diff --git a/src/core/env/src/main/scala/CodegenTags.scala b/src/core/env/src/main/scala/CodegenTags.scala new file mode 100644 index 0000000000..fd8ebe8fac --- /dev/null +++ b/src/core/env/src/main/scala/CodegenTags.scala @@ -0,0 +1,13 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.annotation.StaticAnnotation + +/** + * Generate the internal wrapper for a given class. + * Used for complicated wrappers, where the basic functionality is auto-generated, + * and the rest is added in the inherited wrapper. + */ +class InternalWrapper extends StaticAnnotation diff --git a/src/core/env/src/main/scala/Configuration.scala b/src/core/env/src/main/scala/Configuration.scala new file mode 100644 index 0000000000..d2a66c4e7b --- /dev/null +++ b/src/core/env/src/main/scala/Configuration.scala @@ -0,0 +1,51 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
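// Illustrative sketch only -- not part of this patch. It shows how a Transformer might declare its
// parameters with the MMLParams/Wrappable factories and the HasInputCol / HasOutputCol traits defined
// in Params.scala above. The ExampleSuffixer class and its "suffix" parameter are hypothetical names
// invented for this example.
package com.microsoft.ml.spark

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, concat, lit}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class ExampleSuffixer(override val uid: String)
  extends Transformer with MMLParams with HasInputCol with HasOutputCol {

  def this() = this(Identifiable.randomUID("ExampleSuffixer"))

  // A StringParam with a default value; the MMLParam helper registers the default via setDefault
  val suffix = StringParam(this, "suffix", "String appended to every input value", "-suffixed")
  def setSuffix(value: String): this.type = set(suffix, value)
  def getSuffix: String = $(suffix)

  // Append the suffix to the input column and write the result to the output column
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.toDF().withColumn(getOutputCol, concat(col(getInputCol), lit(getSuffix)))

  override def transformSchema(schema: StructType): StructType =
    schema.add(StructField(getOutputCol, StringType))

  override def copy(extra: ParamMap): ExampleSuffixer = defaultCopy(extra)
}

// Hypothetical usage: new ExampleSuffixer().setInputCol("text").setOutputCol("textOut").transform(df)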
+ +package com.microsoft.ml.spark + +import java.util.Properties +import scala.sys.process._ + +import org.apache.spark._ +import org.apache.spark.sql.SparkSession + +// For development convenience - not hard to reimplement the pieces used here +import com.typesafe.config.{Config, ConfigFactory} + +// This is meant to provide a uniform means of configuring the +// SDK and extension packages in a Spark-compatible form while +// also allowing for env vars as we may use via the CLI +abstract class Configuration(config: Config) { + private val namespace = "mmlspark" + + protected def subspace: String + + def root: String = combine(namespace, subspace) + + private def combine(names: String*): String = names.mkString(".") +} + +class MMLConfig(config: Config) extends Configuration(config) { + override val subspace = "sdk" +} + +object MMLConfig { + // Use spark model of one config/JVM + private lazy val baseConfig = new MMLConfig(ConfigFactory.load()) + def get(): MMLConfig = baseConfig + + private def combine(names: String*): String = names.mkString(".") +} + +// Move to CNTK subpackage +class CNTKConfig(config: Config) extends MMLConfig(config) { + override val subspace = "cntk" + // Danil brings up a good point - device configuration is confusing + // we need to not only say number of devices but also which ones in the + // GPU list to use (1080 gaming + Titan DL for example) +} + +// Move to TLC subpackage +class TLCConfig(config: Config) extends MMLConfig(config) { + override val subspace = "tlc" +} diff --git a/src/core/env/src/main/scala/EnvironmentUtils.scala b/src/core/env/src/main/scala/EnvironmentUtils.scala new file mode 100644 index 0000000000..223bb13713 --- /dev/null +++ b/src/core/env/src/main/scala/EnvironmentUtils.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.nio.file.Paths +import scala.sys.process._ + +import org.apache.spark._ +import org.apache.spark.sql.SparkSession + +import ProcessUtils._ + +object EnvironmentUtils { + + // We should use Apache Commons Lang instead + def IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 + + // Make this overrideable so people have control over the granularity + private lazy val nvInfo: Option[String] = { + println(s"Computing GPU count on ${if(IsWindows) "Windows" else "Linux"}") + val nvsmicmd = if (IsWindows) { + // Unlikely nvidia is on the path + val nvsmi = Paths.get( + System.getenv("ProgramFiles"), + "NVIDIA Corporation", + "NVSMI", + "nvidia-smi.exe").toAbsolutePath.toString + "\"" + nvsmi + "\"" + } else { + "nvidia-smi" + } + // Probably a more Scala-idiomatic way to do this + try { + Some(ProcessUtils.getProcessOutput(s"$nvsmicmd -L")) + } catch { + // Use the logging API to do this properly + case e: Exception => { + println(s"Couldn't query Nvidia SMI for GPU info: $e") + None + } + } + } + + lazy val GPUCount: Option[Int] = if (nvInfo.isEmpty) None else { + // Commons Lang has isNotBlank + val gpucnt = nvInfo.get.split("\n").filter(!_.trim.isEmpty).length + println(s"$gpucnt GPUs detected") + Some(gpucnt) + } + +} diff --git a/src/core/env/src/main/scala/FileUtilities.scala b/src/core/env/src/main/scala/FileUtilities.scala new file mode 100644 index 0000000000..c473772932 --- /dev/null +++ b/src/core/env/src/main/scala/FileUtilities.scala @@ -0,0 +1,139 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.ByteArrayOutputStream +import java.nio.file.{Files, StandardCopyOption} +import java.util.zip.ZipInputStream + +import org.apache.commons.io.IOUtils +import org.apache.spark.input.PortableDataStream + +import scala.io._ +import scala.util.Random + +object FileUtilities { + + // Make `File` available to everyone who uses these utilities + // (Future TODO: make it some nice type, something like `file` in SBT) + type File = java.io.File + + import scala.util.{Try, Success, Failure} + def using[T <: AutoCloseable, U](disposable: Seq[T])(task: Seq[T] => U): Try[U] = { + try { + Success(task(disposable)) + } catch { + case e: Exception => Failure(e) + } finally { + disposable.foreach(d => d.close()) + } + } + + def delTree(file: File): Boolean = + if (!file.exists) true + else { if (file.isDirectory) file.listFiles.forall(delTree) + file.delete } + + def allFiles(dir: File, pred: (File => Boolean) = null): Array[File] = { + def loop(dir: File): Array[File] = { + val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory) + (if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop) + } + loop(dir) + } + + // readFile takes a file name or a File, and function to extract a value from + // BufferedSource which defaults to _.mkString; performs the read, closes the + // source, and returns the result + def readFile[T](file: File, read: BufferedSource => T): T = { + val i = Source.fromFile(file) + try read(i) finally i.close + } + def readFile(file: File): String = readFile(file, _.mkString) + + def writeFile(file: File, stuff: Any): Unit = { + Files.write(file.toPath, stuff.toString.getBytes()) + () + } + + def copyFile(from: File, toDir: File, overwrite: Boolean = false): Unit = { + Files.copy(from.toPath, (new File(toDir, from.getName)).toPath, + (if (overwrite) Seq(StandardCopyOption.REPLACE_EXISTING) + else Seq()): _*) + () + } + + // Perhaps this should move into a more specific place, not a generic file utils thing + def zipFolder(dir: File, out: File): Unit = { + import java.io.{ BufferedInputStream, FileInputStream, FileOutputStream } + import java.util.zip.{ ZipEntry, ZipOutputStream } + val bufferSize = 2 * 1024 + val data = new Array[Byte](bufferSize) + val zip = new ZipOutputStream(new FileOutputStream(out)) + val prefixLen = dir.getParentFile.toString.length + 1 + allFiles(dir).foreach { file => + zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/"))) + val in = new BufferedInputStream(new FileInputStream(file), bufferSize) + var b = 0 + while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) } + in.close() + zip.closeEntry() + } + zip.close() + } + + /** + * iterate through the entries of a streamed .zip file, selecting only sampleRatio of them + * + * @param portableStream Stream of zip file + * @param zipfile File name is only used to construct the names of the entries + * @param sampleRatio What fraction of files is returned from zip + */ + class ZipIterator(portableStream: PortableDataStream, zipfile: String, sampleRatio: Double = 1) + extends Iterator[(String, Array[Byte])] { + + val stream = portableStream.open + private val zipstream = new ZipInputStream(stream) + + val random = { + val rd = new Random() + rd.setSeed(0) + rd + } + + private def getNext: Option[(String, Array[Byte])] = { + var entry = zipstream.getNextEntry + while(entry != null){ + 
if(!entry.isDirectory && random.nextDouble < sampleRatio) { + + val filename = zipfile + java.io.File.separator + entry.getName() + + //extracting all bytes of a given entry + val byteStream = new ByteArrayOutputStream + IOUtils.copy(zipstream, byteStream) + val bytes = byteStream.toByteArray + + assert(bytes.length == entry.getSize, + "incorrect number of bytes is read from zipstream: " + bytes.length + " instead of " + entry.getSize) + + return Some((filename, bytes)) + } + entry = zipstream.getNextEntry + } + + stream.close() + None + } + + private var nextValue = getNext + + def hasNext: Boolean = !nextValue.isEmpty + + def next: (String, Array[Byte]) = { + val result = nextValue.get + nextValue = getNext + result + } + } +} diff --git a/src/core/env/src/main/scala/Logging.scala b/src/core/env/src/main/scala/Logging.scala new file mode 100644 index 0000000000..99bec2f0a8 --- /dev/null +++ b/src/core/env/src/main/scala/Logging.scala @@ -0,0 +1,23 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.logging.log4j.scala.{Logging => Logging4J} +import org.apache.logging.log4j._ + +// Ilya has the logging functions already in a separate branch, so log APIs here removed. +// Merge those into a single trait "Logging" here and have MMLParams incorporate it. + +// Utility to provide log-related canonical construction +// There should be a separate logger at each package (mml, cntk, tlc) +object Logging { + + lazy val config = MMLConfig.get + lazy val logRoot = config.root + + def getLogger(customSuffix: String): Logger = { + LogManager.getLogger(s"$logRoot.$customSuffix") + } + +} diff --git a/src/core/env/src/main/scala/NativeLoader.java b/src/core/env/src/main/scala/NativeLoader.java new file mode 100644 index 0000000000..846d45f302 --- /dev/null +++ b/src/core/env/src/main/scala/NativeLoader.java @@ -0,0 +1,194 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark; + +import java.io.*; +import java.nio.file.Files; +import java.util.ArrayList; + +/** + * A helper class for loading native libraries from Java + * + *

<p>Some Java interfaces depend on native libraries that need to be loaded at runtime. + * This class is a simple utility that can load the native libraries from a jar in one of two ways:</p> + * + * <ul> + *   <li>By name: If a particular native library is needed, it will extract it to a temp folder + *   (along with its dependencies) and load it from there.</li> + *   <li>All libraries: all libraries will be extracted to a temp folder and the libraries in the + *   load manifest are loaded in the order provided, or loaded in the order specified in the + *   native manifest if no load manifest is provided.</li> + * </ul> + * + * <p>The jar with the native libraries must contain a file named 'NATIVE_MANIFEST' that lists + * all native files (one per line, full name) to be extracted. If the loadAll() method is used, + * the libraries will be loaded in the order specified in the manifest. The native libraries should be + * in folders describing the OS they run on: linux, windows, mac.</p>

+ * */ +public class NativeLoader { + + private static final String manifestName = "NATIVE_MANIFEST"; + private static final String loadManifestName = "NATIVE_LOAD_MANIFEST"; + private String resourcesPath; + private String[] nativeList = new String[0]; + private Boolean extractionDone = false; + private File tempDir; + + public NativeLoader(String topLevelResourcesPath) throws IOException{ + this.resourcesPath = getResourcesPath(topLevelResourcesPath); + tempDir = Files.createTempDirectory("tmp").toFile(); + tempDir.deleteOnExit(); + } + + + /** + * Loads all native libraries from the jar file, if the jar contains a plain text file + * named 'NATIVE_MANIFEST'. + * + *

<p>The NATIVE_MANIFEST lists the libraries to be extracted (one per line, full name) + * and the order in which they should be loaded. Alternatively, if only specific top-level + * libraries should be loaded, they can be specified in the NATIVE_LOAD_MANIFEST file, in order.</p>

+ * */ + public void loadAll(){ + try{ + extractNativeLibraries(); + try{ + // First try to find the NATIVE_LOAD_MANIFEST and load the libraries there + String[] loadList = getResourceLines(loadManifestName); + for (String libName: loadList){ + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + } + catch (IOException ee){ + // If loading the NATIVE_LOAD_MANIFEST failed, try loading the libraries + // in the order provided by the NATIVE_MANIFEST + for (String libName: nativeList){ + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + } + } + catch (Exception e){ + // If nothing worked, throw exception + throw new UnsatisfiedLinkError(String.format("Could not load all native libraries because " + + "we encountered the following error: %s", e.getMessage())); + } + } + + /** + * Loads a named native library from the jar file + * + *

<p>This method first tries to load the library from the java.library.path system property. + * Only if that fails are the named native library and its dependencies extracted to + * a temporary folder and loaded from there.</p>

+ * */ + public void loadLibraryByName(String libName){ + try{ + // First try loading by name + // It's possible that the native library is already on a path java can discover + System.loadLibrary(libName); + } + catch (UnsatisfiedLinkError e){ + try{ + extractNativeLibraries(); + // Get the OS specific library name + libName = System.mapLibraryName(libName); + // Try to load library from extracted native resources + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + catch (Exception ee){ + throw new UnsatisfiedLinkError(String.format( + "Could not load the native libraries because " + + "we encountered the following problems: %s and %s", + e.getMessage(), ee.getMessage())); + } + } + } + + private void extractNativeLibraries() throws IOException{ + if (!extractionDone) { + nativeList = getResourceLines(manifestName); + // Extract all OS specific native libraries to temporary location + for (String libName: nativeList) { + extractResourceFromPath(libName, resourcesPath); + } + } + extractionDone = true; + } + + private String[] getResourceLines(String resourceName) throws IOException{ + // Read resource file if it exists + InputStream inStream = NativeLoader.class + .getResourceAsStream(resourcesPath + resourceName); + if (inStream == null) { + throw new FileNotFoundException("Could not find native resources in jar. " + + "Make sure the jar containing the native libraries was added to the classpath."); + } + BufferedReader resourceReader = new BufferedReader( + new InputStreamReader(inStream, "UTF-8") + ); + ArrayList lines = new ArrayList(); + for (String line; (line = resourceReader.readLine()) != null; ) { + lines.add(line); + } + resourceReader.close(); + inStream.close(); + return lines.toArray(new String[lines.size()]); + } + + private static String getResourcesPath(String topLevelResourcesPath){ + String sep = "/"; + String OS = System.getProperty("os.name").toLowerCase(); + String resourcePrefix = topLevelResourcesPath + + sep + "%s" + + sep; + if (OS.contains("linux")){ + return String.format(resourcePrefix, "linux"); + } + else if (OS.contains("windows")){ + return String.format(resourcePrefix, "windows"); + } + else if (OS.contains("mac")|| OS.contains("darwin")){ + return String.format(resourcePrefix, "mac"); + } + else{ + throw new UnsatisfiedLinkError( + String.format("This component doesn't currently have native support for OS: %s", OS) + ); + } + } + + private void extractResourceFromPath(String libName, String prefix) throws IOException{ + + File temp = new File(tempDir.getPath() + File.separator + libName); + temp.createNewFile(); + temp.deleteOnExit(); + + if (!temp.exists()) { + throw new FileNotFoundException(String.format( + "Temporary file %s could not be created. 
Make sure you can write to this location.", + temp.getAbsolutePath()) + ); + } + + String path = prefix + libName; + InputStream inStream = NativeLoader.class.getResourceAsStream(path); + if (inStream == null) { + throw new FileNotFoundException(String.format("Could not find resource %s in jar.", path)); + } + + FileOutputStream outStream = new FileOutputStream(temp); + byte[] buffer = new byte[1 << 18]; + int bytesRead; + + try { + while ((bytesRead = inStream.read(buffer)) >= 0) { + outStream.write(buffer, 0, bytesRead); + } + } finally { + outStream.close(); + inStream.close(); + } + } + +} diff --git a/src/core/env/src/main/scala/ProcessUtilities.scala b/src/core/env/src/main/scala/ProcessUtilities.scala new file mode 100644 index 0000000000..820577d8b1 --- /dev/null +++ b/src/core/env/src/main/scala/ProcessUtilities.scala @@ -0,0 +1,26 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.sys.process._ + +object ProcessUtils { + + // These are only here until we create a more robust + // stream-redirected utility + def getProcessOutput(cmd: String): String = { + println(s"Capturing external process $cmd...") + val ret = cmd.!! + println(s"$ret...done!") + ret + } + + def runProcess(cmd: String): Int = { + println(s"Executing external process $cmd...") + val ret = cmd .! + println(s"$ret...done!") + ret + } + +} diff --git a/src/core/hadoop/build.sbt b/src/core/hadoop/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/hadoop/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/hadoop/src/main/scala/HadoopUtils.scala b/src/core/hadoop/src/main/scala/HadoopUtils.scala new file mode 100644 index 0000000000..b5a4bf5350 --- /dev/null +++ b/src/core/hadoop/src/main/scala/HadoopUtils.scala @@ -0,0 +1,176 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.hadoop + +import java.nio.file.Paths + +import org.apache.commons.io.FilenameUtils + +import scala.sys.process._ +import org.apache.hadoop.conf.{Configuration, Configured} +import org.apache.hadoop.fs.{Path, PathFilter} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.spark.sql.SparkSession +import scala.language.existentials +import scala.util.Random + +class HadoopUtils(hadoopConf: Configuration) { + // Is there a better way? 
We need to deduce full Hadoop conf + // including current active namenode etc as well as YARN properties + // going forward anyway for cluster participation in GPU-YARN-queue mode + // fs.defaultFS isn't good on HDI because we rewrite to WASB + // Answer, Slightly better: + /* + $ hdfs getconf -confKey dfs.nameservices + mycluster + $ hdfs getconf -confKey dfs.ha.namenodes.mycluster + nn1,nn2 + $ hdfs haadmin -getServiceState nn1 + active + $ hdfs haadmin -getServiceState nn2 + standby + */ + private val NAMESERVICES_KEY = "dfs.nameservices" + private val NAMENODE_KEY_ROOT = "dfs.ha.namenodes" + private val RPC_KEY_ROOT = "dfs.namenode.rpc-address" + + private def getNameServices: String = { + hadoopConf.get(NAMESERVICES_KEY) + } + + private def getNameNodes: Seq[String] = { + val nameservices = getNameServices + println(s"Nameservices for cluster at '$nameservices'") + hadoopConf.get(combine(NAMENODE_KEY_ROOT, nameservices)).split(",") + } + + private def isActiveNode(namenode: String): Boolean = { + shellout(s"hdfs haadmin -getServiceState $namenode").startsWith("active") + } + + private def combine(keys: String*): String = keys.mkString(".") + + def getActiveNameNode: String = { + val nameservices = getNameServices + println(s"Nameservices for cluster at '$nameservices'") + val namenodes = getNameNodes + println(s"Querying namenodes:\n${namenodes.foreach(println)}") + val active = namenodes.par + .filter(isActiveNode) + .head + println(s"Found $active as active namenode") + hadoopConf.get(combine(RPC_KEY_ROOT, nameservices, active)) + } + + // This is only to make sure all uses go away ASAP into Process utils + // I realize this means it will be around forever + private def shellout(cmd: String): String = { + println(s"Executing external process $cmd...") + val ret = cmd.!! + println(s"$ret...done!") + ret + } + +} + +/** + * Filter that allows loading a fraction of HDFS files. + */ +class SamplePathFilter extends Configured with PathFilter { + val random = { + val rd = new Random() + rd.setSeed(0) + rd + } + + // Ratio of files to be read from disk + var sampleRatio: Double = 1 + + // When inspectZip is enabled, zip files are treated as directories, and SamplePathFilter can't filter them out. + // Otherwise, zip files are treated as regular files and only sampleRatio of them is read. 
+ var inspectZip: Boolean = true + + override def setConf(conf: Configuration): Unit = { + if (conf != null) { + sampleRatio = conf.getDouble(SamplePathFilter.ratioParam, 1) + inspectZip = conf.getBoolean(SamplePathFilter.inspectZipParam, true) + } + } + + override def accept(path: Path): Boolean = { + // Note: checking fileSystem.isDirectory is very slow here, so we use basic rules instead + !SamplePathFilter.isFile(path) || + (SamplePathFilter.isZipFile(path) && inspectZip) || + random.nextDouble() < sampleRatio + } +} + +object SamplePathFilter { + val ratioParam = "sampleRatio" + val inspectZipParam = "inspectZip" + + def isFile(path: Path): Boolean = FilenameUtils.getExtension(path.toString) != "" + + def isZipFile(filename: String): Boolean = FilenameUtils.getExtension(filename) == "zip" + + def isZipFile(path: Path): Boolean = isZipFile(path.toString) + + /** + * Set/unset hdfs PathFilter + * + * @param value Filter class that is passed to HDFS + * @param sampleRatio Fraction of the files that the filter picks + * @param inspectZip Look into zip files, if true + * @param spark Existing Spark session + * @return + */ + def setPathFilter(value: Option[Class[_]], sampleRatio: Option[Double] = None, + inspectZip: Option[Boolean] = None, spark: SparkSession) + : Option[Class[_]] = { + val flagName = FileInputFormat.PATHFILTER_CLASS + val hadoopConf = spark.sparkContext.hadoopConfiguration + val old = Option(hadoopConf.getClass(flagName, null)) + if (sampleRatio.isDefined) { + hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio.get) + } else { + hadoopConf.unset(SamplePathFilter.ratioParam) + None + } + + if (inspectZip.isDefined) { + hadoopConf.setBoolean(SamplePathFilter.inspectZipParam, inspectZip.get) + } else { + hadoopConf.unset(SamplePathFilter.inspectZipParam) + None + } + + value match { + case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter]) + case None => hadoopConf.unset(flagName) + } + old + } +} + +object RecursiveFlag { + /** + * Sets a value of spark recursive flag + * + * @param value value to set + * @param spark existing spark session + * @return previous value of this flag + */ + def setRecursiveFlag(value: Option[String], spark: SparkSession): Option[String] = { + val flagName = FileInputFormat.INPUT_DIR_RECURSIVE + val hadoopConf = spark.sparkContext.hadoopConfiguration + val old = Option(hadoopConf.get(flagName)) + + value match { + case Some(v) => hadoopConf.set(flagName, v) + case None => hadoopConf.unset(flagName) + } + + old + } +} diff --git a/src/core/ml/build.sbt b/src/core/ml/build.sbt new file mode 100644 index 0000000000..050b722ee9 --- /dev/null +++ b/src/core/ml/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core/test +//> DependsOn: core/spark +//> DependsOn: core/schema diff --git a/src/core/ml/src/test/scala/HashingTFSpec.scala b/src/core/ml/src/test/scala/HashingTFSpec.scala new file mode 100644 index 0000000000..576a35c53f --- /dev/null +++ b/src/core/ml/src/test/scala/HashingTFSpec.scala @@ -0,0 +1,81 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
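// Illustrative sketch only -- not part of this patch. It shows one way the RecursiveFlag and
// SamplePathFilter helpers defined in HadoopUtils.scala above might be combined to read roughly
// a 10% sample of the files under a directory tree; the SampledReadSketch object, the readSample
// method, and the use of binaryFiles are hypothetical choices made for this example.
import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.hadoop.{RecursiveFlag, SamplePathFilter}

object SampledReadSketch {
  def readSample(spark: SparkSession, dir: String): Array[String] = {
    // Turn on recursive directory listing and install the sampling filter,
    // remembering the previous settings so they can be restored afterwards.
    val oldRecursive = RecursiveFlag.setRecursiveFlag(Some("true"), spark)
    val oldFilter = SamplePathFilter.setPathFilter(
      Some(classOf[SamplePathFilter]), sampleRatio = Some(0.1), inspectZip = Some(false), spark = spark)
    try {
      // FileInputFormat-based readers such as binaryFiles honor the configured PathFilter,
      // so only about sampleRatio of the files are listed and read.
      spark.sparkContext.binaryFiles(dir).keys.collect()
    } finally {
      // Restore the previous Hadoop configuration.
      RecursiveFlag.setRecursiveFlag(oldRecursive, spark)
      SamplePathFilter.setPathFilter(oldFilter, spark = spark)
    }
  }
}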
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.spark.ml.linalg.SparseVector + +class HashingTFSpec extends TestBase { + + test("operation on tokenized strings") { + val wordDataFrame = session.createDataFrame(Seq( + (0, Array("Hi", "I", "can", "not", "foo", "foo")), + (1, Array("I")), + (2, Array("Logistic", "regression")), + (3, Array("Log", "f", "reg")) + )).toDF("label", "words") + + val hashDF = new HashingTF().setInputCol("words").setOutputCol("hashedTF").transform(wordDataFrame) + val lines = hashDF.getSVCol("hashedTF") + + val trueLines = List( + new SparseVector(262144, Array(36073,51654,113890,139098,242088), Array(1.0,2.0,1.0,1.0,1.0)), + new SparseVector(262144, Array(113890), Array(1.0)), + new SparseVector(262144, Array(13671,142455), Array(1.0,1.0)), + new SparseVector(262144, Array(24152,74466,122984), Array(1.0,1.0,1.0)) + ) + assert(lines === trueLines) + } + + test("support several values for number of features") { + val featureSizes = List(1, 5, 100, 100000) + val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk") + val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words") + + val fsResults = featureSizes.map { n => + new HashingTF() + .setNumFeatures(n) + .setInputCol("words") + .setOutputCol("hashedTF") + .transform(wordDataFrame) + .getSVCol("hashedTF")(0) + } + val trueResults = Array( + new SparseVector( 1, Array(0), Array(8.0)), + new SparseVector( 5, Array(0,2,3), Array(4.0,2.0,2.0)), + new SparseVector( 100, Array(0,10,18,33,62,67,80), Array(1.0,2.0,1.0,1.0,1.0,1.0,1.0)), + new SparseVector(100000, Array(5833,9467,16680,29018,68900,85762,97510), Array(1.0,1.0,1.0,1.0,1.0,1.0,2.0)) + ) + assert(fsResults === trueResults) + } + + test("treat empty strings as another word") { + val wordDataFrame = session.createDataFrame(Seq( + (0, "hey you no way"), + (1, ""))) + .toDF("label", "sentence") + + val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame) + val hashDF = new HashingTF().setInputCol("tokens").setOutputCol("HashedTF").transform(tokenized) + + val lines = hashDF.getSVCol("hashedTF") + assert(lines(1) === new SparseVector(262144, Array(249180), Array(1.0))) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](new HashingTF().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of n") { + List(0, -1, -10).foreach { n => + intercept[IllegalArgumentException] { new HashingTF().setNumFeatures(n) } + } + } + +} diff --git a/src/core/ml/src/test/scala/IDFSpec.scala b/src/core/ml/src/test/scala/IDFSpec.scala new file mode 100644 index 0000000000..80c71a7195 --- /dev/null +++ b/src/core/ml/src/test/scala/IDFSpec.scala @@ -0,0 +1,103 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} + +class IDFSpec extends TestBase { + + test("operation on hashingTF output") { + val sentenceData = session.createDataFrame(Seq((0, "Hi I"), + (1, "I wish"), + (2, "we Cant"))) + .toDF("label", "sentence") + + val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") + val wordsData = tokenizer.transform(sentenceData) + val hashingTF = new HashingTF() + .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20) + val featurizedData = hashingTF.transform(wordsData) + + val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") + val idfModel = idf.fit(featurizedData) + val rescaledData = idfModel.transform(featurizedData) + + val lines = rescaledData.getSVCol("features") + val trueLines = List( + new SparseVector(20, Array(0, 9), Array(0.6931471805599453, 0.28768207245178085)), + new SparseVector(20, Array(9, 15), Array(0.28768207245178085, 0.6931471805599453)), + new SparseVector(20, Array(6, 13), Array(0.6931471805599453, 0.6931471805599453)) + ) + assert(lines === trueLines) + } + + test("operation on dense or sparse vectors") { + val denseVects = Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1)))) + + val denseVectDF = session.createDataFrame(denseVects).toDF("label", "features") + val sparseVectDF = session.createDataFrame(denseVects.map(p => (p._1, p._2.toSparse))).toDF("label", "features") + + val rescaledDD = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(denseVectDF).transform(denseVectDF) + val rescaledDS = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(denseVectDF).transform(sparseVectDF) + val rescaledSD = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(sparseVectDF).transform(denseVectDF) + val rescaledSS = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(sparseVectDF).transform(sparseVectDF) + + val resultsD = List(rescaledDD, rescaledSD).map(_.getDVCol("scaledFeatures")) + val resultsS = List(rescaledDS, rescaledSS).map(_.getSVCol("scaledFeatures")) + + assert(resultsD.head === resultsD(1)) + assert(resultsS.head === resultsS(1)) + assert(resultsD.head.map(_.toSparse) === resultsS.head) + } + + test("raise an error when applied to a null array") { + val df = session.createDataFrame(Seq((0, Some(new DenseVector(Array(1, 1, 0, 0, 0)))), + (1, Some(new DenseVector(Array(0, 1, 1, 0, 0)))), + (2, None))) + .toDF("id", "features") + val df2 = new IDF().setInputCol("features") + withoutLogging { + intercept[org.apache.spark.SparkException] { + new IDF().setInputCol("features").fit(df) + } + } + } + + test("support setting minDocFrequency") { + val df = session.createDataFrame(Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1))))) + .toDF("id", "features") + + val df2 = new IDF().setMinDocFreq(2) + .setInputCol("features").setOutputCol("rescaledFeatures") + .fit(df).transform(df) + val lines = df2.getDVCol("rescaledFeatures") + val trueLines = List(new DenseVector(Array(0.0, 0.28768207245178085, 0.0, 0.0, 0.0)), + new DenseVector(Array(0.0, 0.28768207245178085, 0.0, 0.0, 0.0)), + new DenseVector(Array(0.0, 0.0, 0.0, 0.0, 0.0))) + assert(lines === trueLines) + } 
+ + ignore("raise an error when given strange values of minDocumentFrequency") { + val df = session.createDataFrame(Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1))))) + .toDF("id", "features") + // new IDF().setMinDocFreq(-1).setInputCol("features").fit(df).transform(df).show() + List(-1, -10).foreach { n => + val estimator = new IDF().setMinDocFreq(n).setInputCol("features") + assertSparkException[IllegalArgumentException](estimator, df) + } + } + +} diff --git a/src/core/ml/src/test/scala/NGramSpec.scala b/src/core/ml/src/test/scala/NGramSpec.scala new file mode 100644 index 0000000000..a9e9cb247b --- /dev/null +++ b/src/core/ml/src/test/scala/NGramSpec.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.{NGram, Tokenizer} +import org.apache.spark.sql.DataFrame + +import scala.collection.mutable + +class NGramSpec extends TestBase { + + def ngramDFToScalaList(dataFrame: DataFrame, outputCol: String = "ngrams"): Array[List[Any]] = { + dataFrame.select(dataFrame(outputCol)).collect() + .map(_.getAs[mutable.WrappedArray[Any]](0).toList) + } + + test("operation on tokenized strings") { + val wordDataFrame = session.createDataFrame(Seq((0, Array("Hi", "I", "can", "not", "foo")), + (1, Array("I")), + (2, Array("Logistic", "regression")), + (3, Array("Log", "f", "reg")))) + .toDF("label", "words") + + val ngramDF = new NGram().setN(3) + .setInputCol("words").setOutputCol("ngrams") + .transform(wordDataFrame) + val ngrams = ngramDFToScalaList(ngramDF) + assert(ngrams(0) === Array("Hi I can", "I can not", "can not foo")) + assert(ngrams(1) === Array()) + assert(ngrams(2) === Array()) + assert(ngrams(3) === Array("Log f reg")) + } + + test("supporting several values for n") { + val ns = 1 to 6 + val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk") + val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words") + val nGramResults = ns.map { n => + ngramDFToScalaList( + new NGram().setN(n) + .setInputCol("words").setOutputCol("ngrams") + .transform(wordDataFrame)) + } + ns.foreach { n => + assert(nGramResults(n-1)(0).head === words.take(n).mkString(" ")) + } + } + + test("handling empty strings gracefully") { + val wordDataFrame = session.createDataFrame(Seq((0, "hey you no way"), + (1, ""))) + .toDF("label", "sentence") + + val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame) + val ngrams = new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(tokenized) + assert(ngramDFToScalaList(ngrams)(1) === Nil) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](new NGram().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of n") { + List(0, -1, -10).foreach { n => + intercept[IllegalArgumentException] { new NGram().setN(n) } + } + } + +} diff --git a/src/core/ml/src/test/scala/OneHotEncoderSpec.scala b/src/core/ml/src/test/scala/OneHotEncoderSpec.scala new file mode 100644 index 0000000000..18bbe4e00e --- /dev/null +++ b/src/core/ml/src/test/scala/OneHotEncoderSpec.scala @@ -0,0 
+1,102 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark._ +import org.apache.spark.ml.feature.OneHotEncoder +import org.apache.spark.ml.linalg.SparseVector + +class OneHotEncoderSpec extends TestBase { + + test("expand category indicies") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.0), + (3, 2.0), + (4, 1.0), + (5, 0.0))) + .toDF("id", "categoryIndex") + + val encoded = + new OneHotEncoder() + .setInputCol("categoryIndex").setOutputCol("categoryVec") + .transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(), Array()), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("support interger indicies") { + val df = session.createDataFrame(Seq((0, 0), + (1, 1), + (2, 0), + (3, 2), + (4, 1), + (5, 0) + )) + .toDF("id", "categoryIndex") + + val encoded= new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec").transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(), Array()), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("support not dropping the last feature") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.0), + (3, 2.0), + (4, 1.0), + (5, 0.0) + )) + .toDF("id", "categoryIndex") + + val encoded= new OneHotEncoder().setDropLast(false) + .setInputCol("categoryIndex").setOutputCol("categoryVec") + .transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(3, Array(0), Array(1.0)), + new SparseVector(3, Array(1), Array(1.0)), + new SparseVector(3, Array(0), Array(1.0)), + new SparseVector(3, Array(2), Array(1.0)), + new SparseVector(3, Array(1), Array(1.0)), + new SparseVector(3, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("raise an error when applied to a null array") { + val df = session.createDataFrame(Seq((0, Some(0.0)), + (1, Some(1.0)), + (2, None))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df) + } + + test("raise an error when it receives a strange float") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.4))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df) + + val df2 = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, -1.0))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df2) + } + +} diff --git a/src/core/ml/src/test/scala/Word2VecSpec.scala b/src/core/ml/src/test/scala/Word2VecSpec.scala new file mode 100644 index 0000000000..82f7e0ffdd --- /dev/null +++ b/src/core/ml/src/test/scala/Word2VecSpec.scala @@ -0,0 +1,93 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.Word2Vec +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.DataFrame + +class Word2VecSpec extends TestBase { + + def genTokenizedText(): DataFrame = { + session.createDataFrame(Seq( + (0, Array("I", "walked", "the", "dog", "down", "the", "street")), + (1, Array("I", "walked", "with", "the", "dog")), + (2, Array("I", "walked", "the", "pup")) + )).toDF("label", "words") + } + + def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0) + + test("operation on tokenized strings") { + val df = genTokenizedText() + + val df2 = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df).transform(df) + + val lines = df2.getDVCol("features") + assert(lines.forall(_.size == 2)) + } + + test("return vectors") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df) + val vectors = model.getVectors.getDVCol("vector") + assert(vectors(0).size == 2) + } + + test("return synonyms") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df) + val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word") + assert(synonyms.length === 2) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of parameters") { + def base(): Word2Vec = genW2V().setInputCol("words") + def assertIllegalArgument[T](f: T => Any, args: T*): Unit = + args.foreach { n => interceptWithoutLogging[IllegalArgumentException] { f(n) } } + assertIllegalArgument[Int](base.setMinCount, -1, -10) + assertIllegalArgument[Int](base.setMaxIter, -1, -10) + assertIllegalArgument[Int](base.setVectorSize, 0, -1, -10) + assertIllegalArgument[Int](base.setWindowSize, 0, -1, -10) + assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10) + assertIllegalArgument[Int](base.setNumPartitions, 0, -1, -10) + assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0) + } + + test("return a vector of zeros when it encounters an OOV word") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df) + val df2 = session.createDataFrame(Seq( + (0, Array("ketchup")))).toDF("label", "words") + val results = model.transform(df2) + val lines = results.getDVCol("features") + val trueLines = List(new DenseVector(Array(0.0, 0.0))) + assert(lines === trueLines) + } + + test("be able to set vector size") { + val df = genTokenizedText() + val vectorSizes = List(1, 10, 100) + vectorSizes.foreach { n => + val results = + genW2V().setVectorSize(n) + .setInputCol("words").setOutputCol("features").fit(df).transform(df) + .getDVCol("features") + assert(results(0).size === n) + } + } + +} diff --git a/src/core/schema/build.sbt b/src/core/schema/build.sbt new file mode 100644 index 0000000000..d61f197ca6 --- /dev/null +++ b/src/core/schema/build.sbt @@ -0,0 +1,4 @@ +// Explicitly prevent core/test code from depending on core sources +//> DependsOn: core/test +//> DependsOn: core/spark +//> DependsOn: 
core/env diff --git a/src/core/schema/src/main/python/TypeConversionUtils.py b/src/core/schema/src/main/python/TypeConversionUtils.py new file mode 100644 index 0000000000..c3dfc50595 --- /dev/null +++ b/src/core/schema/src/main/python/TypeConversionUtils.py @@ -0,0 +1,17 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +def generateTypeConverter(name, cache, typeConverter): + return lambda value: typeConverter(name, value, cache) + +def complexTypeConverter(name, value, cache): + cache[name]=value + if isinstance(value, list): + java_value=[] + for v in value: + if hasattr(v, "_transfer_params_to_java"): + v._transfer_params_to_java() + java_value.append(v._java_obj) + return java_value + value._transfer_params_to_java() + return value._java_obj diff --git a/src/core/schema/src/main/python/Utils.py b/src/core/schema/src/main/python/Utils.py new file mode 100644 index 0000000000..bebde7030a --- /dev/null +++ b/src/core/schema/src/main/python/Utils.py @@ -0,0 +1,69 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= "3": + basestring = str + +from pyspark.ml.util import JavaMLReadable, JavaMLReader, MLReadable +from pyspark.ml.wrapper import JavaParams +from pyspark.ml.common import inherit_doc + +def from_java(java_stage, stage_name): + """ + Given a Java object, create and return a Python wrapper of it. + Used for ML persistence. + Meta-algorithms such as Pipeline should override this method as a classmethod. + """ + def __get_class(clazz): + """ + Loads Python class from its name. + """ + parts = clazz.split(".") + module = ".".join(parts[:-1]) + m = __import__(module) + for comp in parts[1:]: + m = getattr(m, comp) + return m + # Generate a default new instance from the stage_name class. + py_type = __get_class(stage_name) + if issubclass(py_type, JavaParams): + # Load information from java_stage to the instance. + py_stage = py_type() + py_stage._java_obj = java_stage + py_stage._resetUid(java_stage.uid()) + py_stage._transfer_params_from_java() + elif hasattr(py_type, "_from_java"): + py_stage = py_type._from_java(java_stage) + else: + raise NotImplementedError("This Java stage cannot be loaded into Python currently: %r" + % stage_name) + return py_stage + +@inherit_doc +class JavaMMLReadable(MLReadable): + """ + (Private) Mixin for instances that provide JavaMLReader. + """ + + @classmethod + def read(cls): + """Returns an MLReader instance for this class.""" + return JavaMMLReader(cls) + +@inherit_doc +class JavaMMLReader(JavaMLReader): + """ + (Private) Specialization of :py:class:`MLReader` for :py:class:`JavaParams` types + """ + + def __init__(self, clazz): + super(JavaMMLReader, self).__init__(clazz) + + @classmethod + def _java_loader_class(cls, clazz): + """ + Returns the full class name of the Java ML instance. + """ + return clazz.getJavaPackage() diff --git a/src/core/schema/src/main/scala/BinaryFileSchema.scala b/src/core/schema/src/main/scala/BinaryFileSchema.scala new file mode 100644 index 0000000000..2f0d7f2be9 --- /dev/null +++ b/src/core/schema/src/main/scala/BinaryFileSchema.scala @@ -0,0 +1,32 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
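
Aside (not part of the patch): a minimal sketch of how the BinaryFileSchema object that opens below might be exercised. The SparkSession value `spark`, the column name `file`, and the sample path are assumptions made only for illustration.

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{StructField, StructType}
    import com.microsoft.ml.spark.schema.BinaryFileSchema

    // A DataFrame with one column whose type is the (path, bytes) struct.
    val schema = StructType(Seq(StructField("file", BinaryFileSchema.columnSchema, true)))
    val rows = java.util.Arrays.asList(
      Row(Row("some/illustrative/path.bin", Array[Byte](1, 2, 3))))
    val df = spark.createDataFrame(rows, schema)

    BinaryFileSchema.isBinaryFile(df, "file")        // true: column type matches columnSchema
    BinaryFileSchema.getPath(df.head.getStruct(0))   // "some/illustrative/path.bin"
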
+ +package com.microsoft.ml.spark.schema + +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types.{StructType, StructField, StringType, BinaryType} + +object BinaryFileSchema { + + /* + * schema for the binary file column: Row(String, Array[Byte]) + */ + val columnSchema = StructType(Seq( + StructField("path", StringType, true), + StructField("bytes", BinaryType, true) //raw file bytes + )) + + def getPath(row: Row): String = row.getString(0) + def getBytes(row: Row): Array[Byte] = row.getAs[Array[Byte]](1) + + /** + * Check if the dataframe column contains binary file data (i.e. has BinaryFileSchema) + * + * @param df + * @param column + * @return + */ + def isBinaryFile(df: DataFrame, column: String): Boolean = + df.schema(column).dataType == columnSchema + +} diff --git a/src/core/schema/src/main/scala/Categoricals.scala b/src/core/schema/src/main/scala/Categoricals.scala new file mode 100644 index 0000000000..f9367949da --- /dev/null +++ b/src/core/schema/src/main/scala/Categoricals.scala @@ -0,0 +1,317 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +/** + * Contains objects and functions to manipulate Categoricals + */ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types._ +import org.apache.spark.ml.attribute._ +import org.apache.spark.sql.functions.udf +import SchemaConstants._ + +import scala.reflect.ClassTag + +object CategoricalUtilities { + + /** + * Sets the given levels on the column. + * @return The modified dataset. + */ + def setLevels(dataset: DataFrame, column: String, levels: Array[_]): DataFrame = { + if (levels == null) dataset + else dataset.withColumn(column, + dataset.col(column).as(column, + updateLevelsMetadata(dataset.schema(column).metadata, + levels, + getCategoricalTypeForValue(levels.head)))) + } + + /** + * Update the levels on the existing metadata. + * @param existingMetadata The existing metadata to add to. + * @param levels The levels to add to the metadata. + * @param dataType The datatype of the levels. + * @return The new metadata. + */ + def updateLevelsMetadata(existingMetadata: Metadata, levels: Array[_], dataType: DataType): Metadata = { + val bldr = + if (existingMetadata.contains(MMLTag)) { + new MetadataBuilder().withMetadata(existingMetadata.getMetadata(MMLTag)) + } else { + new MetadataBuilder() + } + bldr.putBoolean(Ordinal, false) + dataType match { + case DataTypes.StringType => bldr.putStringArray(ValuesString, levels.asInstanceOf[Array[String]]) + case DataTypes.DoubleType => bldr.putDoubleArray(ValuesDouble, levels.asInstanceOf[Array[Double]]) + // Ints require special treatment, because Spark does not have putIntArray yet: + case DataTypes.IntegerType => bldr.putLongArray(ValuesInt, levels.asInstanceOf[Array[Int]].map(_.toLong)) + case DataTypes.LongType => bldr.putLongArray(ValuesLong, levels.asInstanceOf[Array[Long]]) + case DataTypes.BooleanType => bldr.putBooleanArray(ValuesBool, levels.asInstanceOf[Array[Boolean]]) + case _ => throw new UnsupportedOperationException("Unsupported categorical data type: " + dataType) + } + val metadata = bldr.build() + + new MetadataBuilder().withMetadata(existingMetadata).putMetadata(MMLTag, metadata).build() + } + + /** + * Gets the levels from the dataset. + * @param schema The schema to get the levels from. + * @param column The column to retrieve metadata levels from. + * @return The levels. 
+ */ + def getLevels(schema: StructType, column: String): Option[Array[_]] = { + val metadata = schema(column).metadata + + if (metadata.contains(MMLTag)) { + val dataType: Option[DataType] = getDataType(metadata) + if (dataType.isEmpty) None + else { + dataType.get match { + case DataTypes.StringType => Some(getMap[String](metadata).levels) + case DataTypes.LongType => Some(getMap[Long](metadata).levels) + case DataTypes.IntegerType => Some(getMap[Int](metadata).levels) + case DataTypes.DoubleType => Some(getMap[Double](metadata).levels) + case DataTypes.BooleanType => Some(getMap[Boolean](metadata).levels) + case default => throw new UnsupportedOperationException("Unknown categorical type: " + default.typeName) + } + } + } else { + None + } + } + + /** + * Gets the number of levels from the dataset. + * @param dataset The dataset to get the levels count from. + * @param column The column to retrieve metadata levels count from. + * @return The number of levels. + */ + def getLevelCount(dataset: DataFrame, column: String): Option[Int] = { + val metadata = dataset.schema(column).metadata + + if (metadata.contains(MMLTag)) { + val dataType: Option[DataType] = getDataType(metadata) + + if (dataType.isEmpty) None + else { + val numLevels = + dataType.get match { + case DataTypes.StringType => getMap[String](metadata).numLevels + case DataTypes.LongType => getMap[Long](metadata).numLevels + case DataTypes.IntegerType => getMap[Int](metadata).numLevels + case DataTypes.DoubleType => getMap[Double](metadata).numLevels + case DataTypes.BooleanType => getMap[Boolean](metadata).numLevels + case default => throw new UnsupportedOperationException("Unknown categorical type: " + default.typeName) + } + Option(numLevels) + } + } else { + None + } + } + + /** + * Get the map of array of T from the metadata. + * + * @param ct Implicit class tag. + * @param metadata The metadata to retrieve from. + * @tparam T The type of map to retrieve. + * @return The map of array of T. + */ + def getMap[T](metadata: Metadata)(implicit ct: ClassTag[T]): CategoricalMap[T] = { + val data = + if (metadata.contains(MMLTag)) { + metadata.getMetadata(MMLTag) + } else if (metadata.contains(MLlibTag)) { + metadata.getMetadata(MLlibTag) + } else { + sys.error("Invalid metadata to retrieve map from") + } + + val categoricalMap = implicitly[ClassTag[T]] match { + case ClassTag.Int => new CategoricalMap[Int](data.getLongArray(ValuesInt).map(_.toInt)) + case ClassTag.Double => new CategoricalMap[Double](data.getDoubleArray(ValuesDouble)) + case ClassTag.Boolean => new CategoricalMap[Boolean](data.getBooleanArray(ValuesBool)) + case ClassTag.Long => new CategoricalMap[Long](data.getLongArray(ValuesLong)) + case _ => new CategoricalMap[String](data.getStringArray(ValuesString)) + } + categoricalMap.asInstanceOf[CategoricalMap[T]] + } + + /** + * Get a type for the given value. + * @param value The value to get the type from. + * @tparam T The generic type of the value. + * @return The DataType based on the value. 
+   */
+  def getCategoricalTypeForValue[T](value: T): DataType = {
+    value match {
+      // Complicated type matching is required to get around type erasure
+      case _: String  => DataTypes.StringType
+      case _: Double  => DataTypes.DoubleType
+      case _: Int     => DataTypes.IntegerType
+      case _: Long    => DataTypes.LongType
+      case _: Boolean => DataTypes.BooleanType
+      case _ => throw new UnsupportedOperationException("Unsupported categorical data type")
+    }
+  }
+
+  private def getDataType(metadata: Metadata): Option[DataType] = {
+    val columnMetadata = metadata.getMetadata(MMLTag)
+    val dataType =
+      if (columnMetadata.contains(ValuesString)) Some(DataTypes.StringType)
+      else if (columnMetadata.contains(ValuesLong)) Some(DataTypes.LongType)
+      else if (columnMetadata.contains(ValuesInt)) Some(DataTypes.IntegerType)
+      else if (columnMetadata.contains(ValuesDouble)) Some(DataTypes.DoubleType)
+      else if (columnMetadata.contains(ValuesBool)) Some(DataTypes.BooleanType)
+      else None
+    dataType
+  }
+
+}
+
+/**
+ * A wrapper around level maps: Map[T -> Int] and Map[Int -> T] that converts
+ * the data to/from Spark Metadata in both MLlib and AzureML formats.
+ * @param levels The level values are assumed to be already sorted as needed
+ * @param isOrdinal A flag that indicates if the data are ordinal
+ * @tparam T Input levels could be String, Double, Int, Long, Boolean
+ */
+class CategoricalMap[T](val levels: Array[T], val isOrdinal: Boolean = false) extends Serializable {
+  //TODO: handle NULL values
+
+  require(levels.distinct.size == levels.size, "Categorical levels are not unique.")
+  require(!levels.isEmpty, "Levels should not be empty")
+
+  /** total number of levels */
+  val numLevels = levels.length //TODO: add the maximum possible number of levels?
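
Aside (not part of the patch): a small usage sketch for the CategoricalUtilities helpers defined above. The SparkSession value `spark` and the column names are illustrative assumptions.

    import com.microsoft.ml.spark.schema.CategoricalUtilities
    import spark.implicits._

    val df = Seq((0, "piano"), (1, "guitar"), (2, "piano")).toDF("id", "instrument")

    // Attach categorical levels to the column as MML metadata...
    val tagged = CategoricalUtilities.setLevels(df, "instrument", Array("guitar", "piano"))

    // ...and read them back from the schema.
    CategoricalUtilities.getLevels(tagged.schema, "instrument")   // Some(Array(guitar, piano))
    CategoricalUtilities.getLevelCount(tagged, "instrument")      // Some(2)
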
+ + /** Spark DataType correspondint to type T */ + val dataType = CategoricalUtilities.getCategoricalTypeForValue(levels.head) + + /** Maps levels to the corresponding integer index */ + private lazy val levelToIndex: Map[T, Int] = levels.zipWithIndex.toMap + + /** Returns the index of the given level, can throw */ + def getIndex(level: T): Int = levelToIndex(level) + + /** Returns the index of a given level as Option; does not throw */ + def getIndexOption(level: T): Option[Int] = levelToIndex.get(level) + + /** Checks if the given level exists */ + def hasLevel(level: T): Boolean = levelToIndex.contains(level) + + /** Returns the level of the given index; can throw */ + def getLevel(index: Int): T = levels(index) + + /** Returns the level of the given index as Option; does not throw */ + def getLevelOption(index: Int): Option[T] = + if (index < 0 || index >= numLevels) None else Some(levels(index)) + + /** Stores levels in Spark Metadata in either MLlib format */ + private def toMetadataMllib(existingMetadata: Metadata): Metadata = { + require(!isOrdinal, "Cannot save Ordinal data in MLlib Nominal format currently," + + " because it does not have a public constructor that accepts Ordinal") + + // Currently, MLlib converts all non-string categorical values to string; + // see org.apache.spark.ml.feature.StringIndexer + val strLevels = levels.map(_.toString).asInstanceOf[Array[String]] + + NominalAttribute.defaultAttr.withValues(strLevels).toMetadata(existingMetadata) + } + + /** Stores levels in Spark Metadata in MML format */ + private def toMetadataMML(existingMetadata: Metadata): Metadata = { + CategoricalUtilities.updateLevelsMetadata(existingMetadata, levels, dataType) + } + + /** Add categorical levels to existing Spark Metadata + * @param existingMetadata [tag, categorical metadata] pair is added to existingMetadata, + * where tag is either MLlib or MML + * @param mmlStyle MML (true) or MLlib metadata (false) + */ + def toMetadata(existingMetadata: Metadata, mmlStyle: Boolean): Metadata = { + + // assert that metadata does not have data with this tag + def assertNoTag(tag: String) = + assert(!existingMetadata.contains(tag), + //TODO: add tests to ensure + s"Metadata already contains the tag $tag; all the data are eraised") + + if (mmlStyle) { + assertNoTag(MMLTag) + toMetadataMML(existingMetadata) + } else { + assertNoTag(MLlibTag) + toMetadataMllib(existingMetadata) + } + } + + /** Add categorical levels and in either MML or MLlib style metadata + * @param mmlStyle MML (true) or MLlib metadata (false) + */ + def toMetadata(mmlStyle: Boolean): Metadata = toMetadata(Metadata.empty, mmlStyle) + +} + +/** + * Extract categorical info from the DataFrame column + * @param df dataframe + * @param column column name + */ +class CategoricalColumnInfo(df: DataFrame, column: String) { + + private val columnSchema = df.schema(column) + private val metadata = columnSchema.metadata + + /** Get the basic info: whether the column is categorical or not, actual type of the column, etc */ + val (isCategorical, isMML, isOrdinal, dataType) = { + + val notCategorical = (false, false, false, NullType) + + if (columnSchema.dataType != DataTypes.IntegerType + && columnSchema.dataType != DataTypes.DoubleType) notCategorical + else if (metadata.contains(MMLTag)) { + val columnMetadata = metadata.getMetadata(MMLTag) + + if (!columnMetadata.contains(Ordinal)) notCategorical + else { + val isOrdinal = columnMetadata.getBoolean(Ordinal) + + val dataType = + if (columnMetadata.contains(ValuesString)) 
DataTypes.StringType + else if (columnMetadata.contains(ValuesLong)) DataTypes.LongType + else if (columnMetadata.contains(ValuesInt)) DataTypes.IntegerType + else if (columnMetadata.contains(ValuesLong)) DataTypes.LongType + else if (columnMetadata.contains(ValuesDouble)) DataTypes.DoubleType + else if (columnMetadata.contains(ValuesBool)) DataTypes.BooleanType + else throw new Exception("Unrecognized datatype in MML metadata") + + (true, true, isOrdinal, dataType) + } + } + else if (metadata.contains(MLlibTag)) { + val columnMetadata = metadata.getMetadata(MLlibTag) + // nominal metadata has ["type" -> "nominal"] pair + val isCategorical = columnMetadata.contains(MLlibTypeTag) && + columnMetadata.getString(MLlibTypeTag) == AttributeType.Nominal.name + + if (!isCategorical) notCategorical + else { + val isOrdinal = if (columnMetadata.contains(Ordinal)) columnMetadata.getBoolean(Ordinal) else false + val dataType = + if (columnMetadata.contains(ValuesString)) DataTypes.StringType + else throw new UnsupportedOperationException("nominal attribute does not contain string levels") + (true, false, isOrdinal, dataType) + } + } else + notCategorical + } + +} diff --git a/src/core/schema/src/main/scala/DatasetExtensions.scala b/src/core/schema/src/main/scala/DatasetExtensions.scala new file mode 100644 index 0000000000..c71a814c68 --- /dev/null +++ b/src/core/schema/src/main/scala/DatasetExtensions.scala @@ -0,0 +1,68 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.sql.DataFrame +import scala.collection.mutable + +/** + * Contains methods for manipulating spark dataframes and datasets. + */ +object DatasetExtensions { + + implicit class MMLDataFrame(val df: DataFrame) extends AnyVal { + /** + * Finds an unused column name given initial column name in the given schema. + * The unused column name will be given prefix with a number appended to it, eg "testColumn_5". + * There will be an underscore between the column name and the number appended. + * + * @return The unused column name. + */ + def withDerivativeCol(prefix: String): String = { + val columnNamesSet = mutable.HashSet(df.columns: _*) + findUnusedColumnName(prefix)(columnNamesSet) + } + + /** + * Gets the column values as the given type. + * @param colname The column name to retrieve from. + * @tparam T The type to retrieve. + * @return The sequence of values in the column. + */ + def getColAs[T](colname: String): Seq[T] = { + df.select(colname).collect.map(_.getAs[T](0)) + } + + /** + * Gets the spark sparse vector column. + * @return The spark sparse vector column. + */ + def getSVCol: String => Seq[SparseVector] = getColAs[SparseVector] _ + + /** + * Gets the spark dense vector column. + * @return The spark dense vector column. + */ + def getDVCol: String => Seq[DenseVector] = getColAs[DenseVector] _ + } + + /** + * Finds an unused column name given initial column name and a list of existing column names. + * The unused column name will be given prefix with a number appended to it, eg "testColumn_5". + * There will be an underline between the column name and the number appended. + * + * @return The unused column name. 
+ */ + def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = { + var counter = 2 + var unusedColumnName = prefix + while (columnNames.contains(unusedColumnName)) { + unusedColumnName += "_" + counter + counter += 1 + } + unusedColumnName + } + +} diff --git a/src/core/schema/src/main/scala/ImageSchema.scala b/src/core/schema/src/main/scala/ImageSchema.scala new file mode 100644 index 0000000000..f5c2502390 --- /dev/null +++ b/src/core/schema/src/main/scala/ImageSchema.scala @@ -0,0 +1,46 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types._ + +import scala.reflect.ClassTag + +object ImageSchema { + + /** + * schema for the image column: Row(String, Int, Int, Int, Array[Byte]) + */ + val columnSchema = StructType( + StructField("path", StringType, true) :: + StructField("height", IntegerType, true) :: + StructField("width", IntegerType, true) :: + StructField("type", IntegerType, true) :: //OpenCV type: CV_8U in most cases + StructField("bytes", BinaryType, true) :: Nil) //OpenCV bytes: row-wise BGR in most cases + + def getPath(row: Row): String = row.getString(0) + def getHeight(row: Row): Int = row.getInt(1) + def getWidth(row: Row): Int = row.getInt(2) + def getType(row: Row): Int = row.getInt(3) + def getBytes(row: Row): Array[Byte] = row.getAs[Array[Byte]](4) + + /** + * Check if the dataframe column contains images (i.e. has imageSchema) + * + * @param df + * @param column + * @return + */ + def isImage(df: DataFrame, column: String): Boolean = + df.schema(column).dataType == columnSchema + + private[spark] def loadLibraryForAllPartitions[T:ClassTag](rdd: RDD[T], lib: String):RDD[T] = { + def perPartition(it: Iterator[T]):Iterator[T] = { + new NativeLoader("/org/opencv/lib").loadLibraryByName(lib); it } + rdd.mapPartitions(perPartition, preservesPartitioning = true) + } +} diff --git a/src/core/schema/src/main/scala/SchemaConstants.scala b/src/core/schema/src/main/scala/SchemaConstants.scala new file mode 100644 index 0000000000..b685f8ea73 --- /dev/null +++ b/src/core/schema/src/main/scala/SchemaConstants.scala @@ -0,0 +1,44 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +/** + * Contains constants used by modules for schema. 
+ */ +object SchemaConstants { + + val ScoreColumnKind = "ScoreColumnKind" + val ScoreValueKind = "ScoreValueKind" + + val TrueLabelsColumn = "true_labels" + val ScoredLabelsColumn = "scored_labels" + val ScoresColumn = "scores" + val ScoredProbabilitiesColumn = "scored_probabilities" + + val ScoreModelPrefix = "score_model" + val MMLTag = "mml" // MML metadata tag + val MLlibTag = "ml_attr" // MLlib metadata tag, see org.apache.spark.ml.attribute.AttributeKeys + + /** The following tags are used in Metadata representation of categorical data + * do not change them or use them directly + * (see org.apache.spark.ml.attribute.AttributeKeys for the first three) + */ + val Ordinal = "ord" // common tag for both MLlib and MML + val MLlibTypeTag = "type" // MLlib tag for the attribute types + val ValuesString = "vals" // common tag for both MLlib and MML + val ValuesInt = "vals_int" + val ValuesLong = "vals_long" + val ValuesDouble = "vals_double" + val ValuesBool = "vals_bool" + + // Score value kinds, or types of ML: + val ClassificationKind = "Classification" + val RegressionKind = "Regression" + + // Spark native column names + val SparkPredictionColumn = "prediction" + val SparkRawPredictionColumn = "rawPrediction" + val SparkProbabilityColumn = "probability" + +} diff --git a/src/core/schema/src/main/scala/SparkSchema.scala b/src/core/schema/src/main/scala/SparkSchema.scala new file mode 100644 index 0000000000..858409cf37 --- /dev/null +++ b/src/core/schema/src/main/scala/SparkSchema.scala @@ -0,0 +1,352 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.functions._ +import SchemaConstants._ +import scala.reflect.ClassTag + +/** + * Schema modification and information retrieval methods. + */ +object SparkSchema { + + /** + * Sets the label column name. + * + * @param dataset The dataset to set the label column name on. + * @param modelName The model name. + * @param columnName The column name to set as the label. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setLabelColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(TrueLabelsColumn) + + /** + * Sets the scored labels column name. + * + * @param dataset The dataset to set the scored labels column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scored label. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoredLabelsColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoredLabelsColumn) + + /** + * Sets the scored probabilities column name. + * + * @param dataset The dataset to set the scored probabilities column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scored probability. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoredProbabilitiesColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoredProbabilitiesColumn) + + /** + * Sets the scores column name. + * + * @param dataset The dataset to set the scores column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scores. 
+ * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoresColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoresColumn) + + /** + * Gets the label column name. + * + * @param dataset The dataset to get the label column from. + * @param modelName The model to retrieve the label column from. + * @return The label column name. + */ + def getLabelColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(TrueLabelsColumn)(dataset.schema, modelName) + + /** + * Gets the scored labels column name. + * + * @param dataset The dataset to get the scored labels column from. + * @param modelName The model to retrieve the scored labels column from. + * @return The scored labels column name. + */ + def getScoredLabelsColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoredLabelsColumn)(dataset.schema, modelName) + + /** + * Gets the scores column name. + * + * @param dataset The dataset to get the scores column from. + * @param modelName The model to retrieve the scores column from. + * @return The scores column name. + */ + def getScoresColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoresColumn)(dataset.schema, modelName) + + /** + * Gets the scored probabilities column name. + * + * @param dataset The dataset to get the scored probabilities column from. + * @param modelName The model to retrieve the scored probabilities column from. + * @return The scored probabilities column name. + */ + def getScoredProbabilitiesColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoredProbabilitiesColumn)(dataset.schema, modelName) + + /** + * Gets the label column name. + * + * @param dataset The dataset to get the label column from. + * @param modelName The model to retrieve the label column from. + * @return The label column name. + */ + def getLabelColumnName: (StructType, String) => String = + getScoreColumnKindColumn(TrueLabelsColumn) + + /** + * Gets the scored labels column name. + * + * @param dataset The dataset to get the scored labels column from. + * @param modelName The model to retrieve the scored labels column from. + * @return The scored labels column name. + */ + def getScoredLabelsColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoredLabelsColumn) + + /** + * Gets the scores column name. + * + * @param dataset The dataset to get the scores column from. + * @param modelName The model to retrieve the scores column from. + * @return The scores column name. + */ + def getScoresColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoresColumn) + + /** + * Gets the scored probabilities column name. + * + * @param dataset The dataset to get the scored probabilities column from. + * @param modelName The model to retrieve the scored probabilities column from. + * @return The scored probabilities column name. + */ + def getScoredProbabilitiesColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoredProbabilitiesColumn) + + /** + * Gets the score value kind or null if it does not exist from a dataset. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param dataset The dataset to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. + * @param columnName The column to retrieve the score value kind from. 
+ * @return + */ + def getScoreValueKind(dataset: DataFrame, modelName: String, columnName: String): String = { + getScoreValueKind(dataset.schema, modelName, columnName) + } + + /** + * Gets the score value kind or null if it does not exist from the schema. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param schema The schema to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. + * @param columnName The column to retrieve the score value kind from. + * @return + */ + def getScoreValueKind(schema: StructType, modelName: String, columnName: String): String = { + val metadata = schema(columnName).metadata + if (metadata == null) return null + getMetadataFromModule(metadata, modelName, ScoreValueKind) + } + + /** + * Sets the score column kind. + * + * @param scoreColumnKindColumn The score column kind column. + * @param dataset The dataset to set the score column kind on. + * @param modelName The model name. + * @param columnName The column name to set as the specified score column kind. + * @param scoreValueKindModel The model type. + * @return + */ + private def setColumnName(scoreColumnKindColumn: String) + (dataset: DataFrame, modelName: String, + columnName: String, scoreValueKindModel: String): DataFrame = { + dataset.withColumn(columnName, + dataset.col(columnName).as(columnName, + updateMetadata(dataset.schema(columnName).metadata, + scoreColumnKindColumn, scoreValueKindModel, modelName))) + } + + /** + * Gets the score column kind column name or null if it does not exist. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param schema The schema to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. 
+ * @return + */ + private def getScoreColumnKindColumn(scoreColumnKindColumn: String) + (schema: StructType, modelName: String): String = { + val structField = schema.find { + case StructField(_, _, _, metadata) => + getMetadataFromModule(metadata, modelName, ScoreColumnKind) == scoreColumnKindColumn + } + if (structField.isEmpty) null else structField.get.name + } + + private def updateMetadata(metadata: Metadata, scoreColumnKindColumn: String, + scoreValueKindModel: String, moduleName: String): Metadata = { + val mmltagMetadata = + if (metadata.contains(MMLTag)) metadata.getMetadata(MMLTag) + else null + val moduleNameMetadata = + if (mmltagMetadata != null && mmltagMetadata.contains(moduleName)) + mmltagMetadata.getMetadata(moduleName) + else null + + val moduleMetadataBuilder = new MetadataBuilder() + if (mmltagMetadata != null && moduleNameMetadata != null) { + moduleMetadataBuilder.withMetadata(moduleNameMetadata) + } + moduleMetadataBuilder.putString(ScoreColumnKind, scoreColumnKindColumn) + moduleMetadataBuilder.putString(ScoreValueKind, scoreValueKindModel) + + val moduleBuilder = new MetadataBuilder() + if (mmltagMetadata != null) { + moduleBuilder.withMetadata(mmltagMetadata) + } + moduleBuilder.putMetadata(moduleName, moduleMetadataBuilder.build()) + + new MetadataBuilder() + .withMetadata(metadata) + .putMetadata(MMLTag, moduleBuilder.build()) + .build() + } + + private def getMetadataFromModule(colMetadata: Metadata, moduleName: String, tag: String): String = { + if (!colMetadata.contains(MMLTag)) return null + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + if (!mlTagMetadata.contains(moduleName)) return null + val modelMetadata = mlTagMetadata.getMetadata(moduleName) + if (!modelMetadata.contains(tag)) return null + modelMetadata.getString(tag) + } + + /** + * Convert the regular column to the categorical one + * @param df dataframe + * @param column column name + * @param newColumn new categorical column name + * @param mmlStyle MML format (true, default) or MLlib format (false) + * @return updated dataframe + */ + def makeCategorical(df: DataFrame, + column: String, + newColumn: String, + mmlStyle: Boolean = true): DataFrame = { + + val dataType = df.schema(column).dataType + val collected = df.select(column).distinct().collect() + + dataType match { + //TODO: all cases below are the same; can we simplify the code with a single generic function? 
+ case _: IntegerType => { + val levels = collected.map(row => row(0).asInstanceOf[Int]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Int) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: LongType => { + val levels = collected.map(row => row(0).asInstanceOf[Long]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Long) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: DoubleType => { + val levels = collected.map(row => row(0).asInstanceOf[Double]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Double) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: StringType => { + val levels = collected.map(row => row(0).asInstanceOf[String]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: String) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: BooleanType => { + val levels = collected.map(row => row(0).asInstanceOf[Boolean]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Boolean) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + //case _: BooleanType => makeCategorical[Boolean] + case _ => throw new Exception("Unsupported Categorical type " + dataType.toString) + } + } + + /** + * Convert the regular column to the categorical one + * @param df dataframe + * @param column column name + * @param newColumn new categorical column name + * @param mmlStyle MML format (true, default) or MLlib format (false) + * @return updated dataframe + */ + def makeNonCategorical(df: DataFrame, + column: String, + newColumn: String): DataFrame = { + + val info = new CategoricalColumnInfo(df, column) + require(info.isCategorical, "column " + column + "is not Categorical") + require(info.dataType == StringType, "underlying categorical is not String based") //TODO: add other types too + //(isCategorical, isMML, isOrdinal, dataType) + + val map = CategoricalUtilities.getMap[String](df.schema(column).metadata) + val getLevel = udf((index: Int) => map.getLevel(index)) //TODO: can throw? 
+ df.withColumn(newColumn, getLevel(df(column)).as(newColumn)) //TODO: keeping metadata: .as(newColumn,metadata) + } + + /** find if the given column is a string */ + def isString(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType == DataTypes.StringType + } + + /** find if the given column is numeric */ + def isNumeric(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType.isInstanceOf[NumericType] + } + + /** find if the given column is boolean */ + def isBoolean(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType.isInstanceOf[BooleanType] + } + + /** find if the given column is Categorical; use CategoricalColumnInfo for more details */ + def isCategorical(df: DataFrame, column: String): Boolean = { + val info = new CategoricalColumnInfo(df, column) + info.isCategorical + } + +} diff --git a/src/core/schema/src/test/scala/TestCategoricals.scala b/src/core/schema/src/test/scala/TestCategoricals.scala new file mode 100644 index 0000000000..11bd6f41f4 --- /dev/null +++ b/src/core/schema/src/test/scala/TestCategoricals.scala @@ -0,0 +1,131 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ +import com.microsoft.ml.spark.schema._ + +import scala.reflect.{ClassTag, classTag} + +class TestCategoricalMap extends TestBase { + + /** basic asserts that should be true for all Categorical Maps + * + * @param levels sorted categorical levels + * @param wrong_level a value that is not a level + * @param dataType corresponding Spark datatype + * @param isOrdinal whether levels are Ordinal or not + * @param mmlStyle save to MML (true, default) or MLlib (false) metadata + */ + private def testMapBasic[T: ClassTag](levels: Array[T], wrong_level: T, dataType: DataType, + isOrdinal: Boolean, mmlStyle: Boolean = true): Unit = { + + val map = new CategoricalMap(levels, isOrdinal) + val s = " " + classTag[T]; // to idenfity which type throws the error + + assert(map.numLevels == levels.length, "numLevels" + s) + assert(map.isOrdinal == isOrdinal, "isOrdinal" + s) + assert(map.dataType == dataType, "dataType" + s) + assert(map.getIndex(levels.head) == 0 & map.getIndex(levels.last) == levels.length - 1, "getIndex" + s) + assert(map.getIndexOption(wrong_level) == None & map.getIndexOption(levels(1)) == Some(1), "getIndexOption" + s) + assert(map.hasLevel(levels(1)) == true & map.hasLevel(wrong_level) == false, "hasLevel" + s) + assert(map.getLevel(1) == levels(1), "getLevel" + s) + assert(map.getLevelOption(1) == Some(levels(1)) & map.getLevelOption(-1) == None, "getLevelOption" + s) + + val mml_meta = map.toMetadata(mmlStyle) //TODO: check metadata for correctness + } + + /** test CategoricalMap for different undelying types */ + test("Test: Create basic CategoricalMap") { + + for (mmlStyle <- List(true, false)) { + + val isOrdinal = mmlStyle + + val strArray = Array("as", "", "efe") + testMapBasic(strArray, "wrong_level", StringType, isOrdinal, mmlStyle) + + val intArray = Array[Int](34, 54747, -346, 756, 0) + testMapBasic(intArray, -45, IntegerType, isOrdinal, mmlStyle) + + val longArray = Array[Long](34, 54747, -346, 756, 0) + testMapBasic(longArray, (-45: Long), LongType, isOrdinal, mmlStyle) + + val doubleArray = Array[Double](34.45, 54.747, -3.46, 7.56, 0) + 
testMapBasic(doubleArray, (-45: Double), DoubleType, isOrdinal, mmlStyle) + } + } + + import session.implicits._ + + /** sample dafaframe */ + private val DF = Seq[(Int, Long, Double, Boolean, String)]( + (-3, 24L, 0.32534, true, "piano"), + (1, 5L, 5.67, false, "piano"), + (-3, 5L, 0.32534, false, "guitar")) + .toDF("int", "long", "double", "bool", "string") + + /** sample dafaframe with Null values*/ + private val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( + ("Alice", null, 44.3), + (null, 60, null), + ("Josh", 25, Double.NaN)) + .toDF("string", "int", "double") + + /** test CategoricalMap for different undelying types */ + test("Test: Convert the regular column into categorical") { + for (col <- DF.columns; mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + assert(!SparkSchema.isCategorical(df, col), "Check for non-categorical columns") + assert(SparkSchema.isCategorical(df, newName), "Check for categorical columns") + + val info = new CategoricalColumnInfo(df, newName) + + assert(info.isCategorical, "the column is supposed to be categorical") + assert(info.isMML == mmlStyle, "wrong metadata style in categorical column") + assert(!info.isOrdinal, "wrong ordinal style in categorical column") + if (mmlStyle) + assert(info.dataType == DF.schema(col).dataType, "categorical data type is not correct") + else + assert(info.dataType == StringType, "categorical data type is not String") + } + } + + test("Test: String categorical levels") { + val col = "string" + val true_levels = DF.select("string").collect().map(_(0).toString).distinct.sorted + + for (mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + val map = CategoricalUtilities.getMap[String](df.schema(newName).metadata) + + val levels = map.levels.sorted + + (true_levels zip levels).foreach { + case (a, b) => assert(a == b, "categorical levels are not the same") + } + } + } + + test("Test: Going to Categorical and Back") { + val col = "string" + for (mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + val testName = col + "_noncat" + val df1 = SparkSchema.makeNonCategorical(df, column = newName, newColumn = testName) + + df1.select(col, testName).collect.foreach(row => assert(row(0) == row(1), "two columns should be the same")) + } + } + +} diff --git a/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala b/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala new file mode 100644 index 0000000000..7cb44dc0c5 --- /dev/null +++ b/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala @@ -0,0 +1,118 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark.TransformerFuzzingTest +import org.apache.spark.SparkException +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.feature.{FastVectorAssembler, StringIndexer} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +/** + * Verifies the fast vector assembler, which only keeps categorical metadata and removes all other metadata. 
+ * TODO: Move this to core/spark and remove MML dependencies for the verification + */ +class VerifyFastVectorAssembler extends TransformerFuzzingTest { + + val invalidExceptionError = "Could not catch correct exception" + + val inputCols = Array("a", "b", "c", "d", "e") + val outputCol = "testCol" + val mockDataset = session.createDataFrame(Seq( + (0, 2, 0.5, 0.6, 0), + (1, 3, 0.4, 0.5, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3) + )).toDF(inputCols: _*) + + test("Verify fast vector assembler does not keep metadata for non-categorical columns") { + val fastAssembler = new FastVectorAssembler().setInputCols(inputCols).setOutputCol(outputCol) + val transformedDataset = fastAssembler.transform(mockDataset) + // Assert metadata is empty + assert(transformedDataset.schema(outputCol).metadata.toString() == + "{\"ml_attr\":{\"attrs\":{},\"num_attrs\":0}}") + } + + test("Verify fast vector assembler throws when the first column is not categorical") { + + val (inputCols: Array[String], catColumn: String, categoricalData: DataFrame) = createCategoricalData + + val outputCol = "testCol" + + val fastAssembler = new FastVectorAssembler() + .setInputCols((inputCols.toList.drop(1) ::: (List(catColumn))).toArray) + .setOutputCol(outputCol) + + var caughtException: Boolean = false + try { + val transformedDataset = fastAssembler.transform(categoricalData) + } + catch { + case exception: SparkException => { + caughtException = true + exception.getMessage.contains("Categorical columns must precede all others") + } + case _: Throwable => throw new Exception(invalidExceptionError) + } + + if (!caughtException) + throw new Exception(invalidExceptionError) + } + + test("Verify fast vector assembler works when the first column is a categorical column") { + + val (inputCols: Array[String], catColumn: String, categoricalData: DataFrame) = createCategoricalData + + val outputCol = "testCol" + + val fastAssembler2 = new FastVectorAssembler() + .setInputCols((catColumn :: inputCols.toList.drop(1)).toArray) + .setOutputCol(outputCol) + val transformedDataset2 = fastAssembler2.transform(categoricalData) + + // Assert metadata is not empty + val mlattrData = transformedDataset2.schema(outputCol).metadata.getMetadata(SchemaConstants.MLlibTag) + // assert the metadata is equal to: "{\"ml_attr\":{\"attrs\":{\"nominal\":[{\"vals\":[\"are\",\"how\", + // \"hello\",\"you\"],\"idx\":0,\"name\":\"cat\"}]},\"num_attrs\":1}}" + val attrsTag = "attrs" + assert(mlattrData.contains(attrsTag)) + val attrsData = mlattrData.getMetadata(attrsTag) + val nominalTag = "nominal" + assert(attrsData.contains(nominalTag)) + val nominalData = attrsData.getMetadataArray(nominalTag) + val valsTag = "vals" + assert(nominalData(0).contains(valsTag)) + assert(nominalData(0).getStringArray(valsTag).contains("are")) + assert(nominalData(0).getStringArray(valsTag).contains("how")) + assert(nominalData(0).getStringArray(valsTag).contains("hello")) + assert(nominalData(0).getStringArray(valsTag).contains("you")) + + } + + def createCategoricalData: (Array[String], String, DataFrame) = { + val inputCols = Array("a", "b", "c", "d", "e") + + val dataset = session.createDataFrame(Seq( + ("hello", 2, 0.5, 0.6, 0), + ("how", 3, 0.4, 0.5, 1), + ("are", 4, 0.78, 0.99, 2), + ("you", 5, 0.12, 0.34, 3) + )).toDF(inputCols: _*) + + val catColumn = "cat" + val indexer = new StringIndexer().setInputCol("a").setOutputCol(catColumn).fit(dataset) + val categoricalData = indexer.transform(dataset).toDF() + (inputCols, catColumn, categoricalData) + } + + override 
def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[FastVectorAssembler].setInputCols(inputCols).setOutputCol(outputCol) + + override def createDataset: DataFrame = mockDataset + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new FastVectorAssembler() +} diff --git a/src/core/schema/src/test/scala/VerifySparkSchema.scala b/src/core/schema/src/test/scala/VerifySparkSchema.scala new file mode 100644 index 0000000000..5b240a3906 --- /dev/null +++ b/src/core/schema/src/test/scala/VerifySparkSchema.scala @@ -0,0 +1,56 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark.TestBase + +/** + * Verifies the spark schema functions. + */ +class VerifySparkSchema extends TestBase { + + val labelColumn = "label" + val scoreColumn = "score" + val probabilityColumn = "probability" + val scoredLabelsColumn = "scored label" + test("Spark schema should be able to set and get label, score, probability and scored labels column name") { + val dataset = session.createDataFrame(Seq( + (0, Array("Hi", "I", "can", "not", "foo"), 0.50, 0.60, 0), + (1, Array("I"), 0.40, 0.50, 1), + (2, Array("Logistic", "regression"), 0.78, 0.99, 2), + (3, Array("Log","f", "reg"), 0.12, 0.34, 3) + )).toDF(labelColumn, "words", scoreColumn, probabilityColumn, scoredLabelsColumn) + + val modelName = "Test model name" + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, modelName, labelColumn, SchemaConstants.RegressionKind) + val labelColumnNameRetrieved = + SparkSchema.getLabelColumnName(datasetWithLabel, modelName) + + assert(labelColumnNameRetrieved == labelColumn) + + val datasetWithScore = + SparkSchema.setScoresColumnName(dataset, modelName, scoreColumn, SchemaConstants.RegressionKind) + val scoreColumnNameRetrieved = + SparkSchema.getScoresColumnName(datasetWithScore, modelName) + + assert(scoreColumnNameRetrieved == scoreColumn) + + val datasetWithProbability = + SparkSchema.setScoredProbabilitiesColumnName(dataset, modelName, probabilityColumn, + SchemaConstants.RegressionKind) + val probabilityColumnNameRetrieved = + SparkSchema.getScoredProbabilitiesColumnName(datasetWithProbability, modelName) + + assert(probabilityColumnNameRetrieved == probabilityColumn) + + val datasetWithScoredLabels = + SparkSchema.setScoredLabelsColumnName(dataset, modelName, scoredLabelsColumn, SchemaConstants.RegressionKind) + val scoredLabelsColumnNameRetrieved = + SparkSchema.getScoredLabelsColumnName(datasetWithScoredLabels, modelName) + + assert(scoredLabelsColumnNameRetrieved == scoredLabelsColumn) + } + +} diff --git a/src/core/spark/build.sbt b/src/core/spark/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/spark/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/spark/src/main/scala/ArrayMapParam.scala b/src/core/spark/src/main/scala/ArrayMapParam.scala new file mode 100644 index 0000000000..2da6645bb8 --- /dev/null +++ b/src/core/spark/src/main/scala/ArrayMapParam.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
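
Aside (not part of the patch): a short sketch of the metadata round-trip that the test above verifies, extended to the score-value-kind lookup. The SparkSession value `spark`, the model name, and the column names are illustrative assumptions.

    import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema}
    import spark.implicits._

    val scored = Seq((0.0, 0.12), (1.0, 0.98)).toDF("label", "score")

    // Tag the "score" column as the scores column of a (made-up) model.
    val tagged = SparkSchema.setScoresColumnName(
      scored, "myModel", "score", SchemaConstants.ClassificationKind)

    SparkSchema.getScoresColumnName(tagged, "myModel")          // "score"
    SparkSchema.getScoreValueKind(tagged, "myModel", "score")   // "Classification"
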
+ +package org.apache.spark.ml.param + +import spray.json._ +import org.apache.spark.ml.util.Identifiable +import scala.collection.immutable.Map + +object ArrayMapJsonProtocol extends DefaultJsonProtocol { + + import spray.json._ + implicit object MapJsonFormat extends JsonFormat[Map[String, Any]] { + def write(m: Map[String, Any]): JsValue = { + JsObject(m.mapValues { + case v: Int => JsNumber(v) + case v: Double => JsNumber(v) + case v: String => JsString(v) + case true => JsTrue + case false => JsFalse + case v: Map[_, _] => write(v.asInstanceOf[Map[String, Any]]) + case default => serializationError(s"Unable to serialize $default") + }) + } + + def read(value: JsValue): Map[String, Any] = value.asInstanceOf[JsObject].fields.map(kvp => { + val convValue = kvp._2 match { + case JsNumber(n) => if (n.isValidInt) n.intValue().asInstanceOf[Any] else n.toDouble.asInstanceOf[Any] + case JsString(s) => s + case JsTrue => true + case JsFalse => false + case v: JsValue => read(v) + case default => deserializationError(s"Unable to deserialize $default") + } + (kvp._1, convValue) + }) + } + +} + +import ArrayMapJsonProtocol._ + +/** + * Param for Array of stage parameter maps. + */ +class ArrayMapParam(parent: String, name: String, doc: String, isValid: Array[Map[String, Any]] => Boolean) + extends Param[Array[Map[String, Any]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Array[Map[String, Any]] => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + override def w(value: Array[Map[String, Any]]): ParamPair[Array[Map[String, Any]]] = super.w(value) + + override def jsonEncode(value: Array[Map[String, Any]]): String = { + val json = value.toSeq.asInstanceOf[Seq[Map[String, Int]]].toJson + json.prettyPrint + } + + override def jsonDecode(json: String): Array[Map[String, Any]] = { + val jsonValue = json.parseJson + jsonValue.convertTo[Seq[Map[String, Any]]].toArray + } + +} diff --git a/src/core/spark/src/main/scala/EstimatorParam.scala b/src/core/spark/src/main/scala/EstimatorParam.scala new file mode 100644 index 0000000000..bfdc213eee --- /dev/null +++ b/src/core/spark/src/main/scala/EstimatorParam.scala @@ -0,0 +1,36 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.ml.param + +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.util.Identifiable + +/** + * Param for Estimator. Needed as spark has explicit params for many different types but not Estimator. + */ +class EstimatorParam(parent: String, name: String, doc: String, isValid: Estimator[_ <: Model[_]] => Boolean) + extends Param[Estimator[_ <: Model[_]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Estimator[_ <: Model[_]] => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = + this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). 
*/
+  override def w(value: Estimator[_ <: Model[_]]): ParamPair[Estimator[_ <: Model[_]]] =
+    super.w(value)
+
+  override def jsonEncode(value: Estimator[_ <: Model[_]]): String = {
+    throw new NotImplementedError("The estimator cannot be encoded.")
+  }
+
+  override def jsonDecode(json: String): Estimator[_ <: Model[_]] = {
+    throw new NotImplementedError("The estimator cannot be decoded.")
+  }
+
+}
diff --git a/src/core/spark/src/main/scala/FastVectorAssembler.scala b/src/core/spark/src/main/scala/FastVectorAssembler.scala
new file mode 100644
index 0000000000..ddd9072d47
--- /dev/null
+++ b/src/core/spark/src/main/scala/FastVectorAssembler.scala
@@ -0,0 +1,154 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package org.apache.spark.ml.feature
+
+import scala.collection.mutable.ArrayBuilder
+import org.apache.spark.SparkException
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup}
+import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util._
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+
+/**
+ * A fast vector assembler. The columns given must be ordered such that categorical columns come first
+ * (otherwise Spark learners will assign categorical attributes to the wrong index).
+ * Does not keep spurious numeric metadata, which can significantly slow down computations when there are
+ * millions of columns.
+ */
+class FastVectorAssembler (override val uid: String)
+  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {
+
+  def this() = this(Identifiable.randomUID("FastVectorAssembler"))
+
+  /** @group setParam */
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    // Schema transformation.
+    val schema = dataset.schema
+    lazy val first = dataset.toDF.first()
+    var addedNumericField = false
+
+    // Propagate only nominal (categorical) attributes (others only slow down the code)
+    val attrs: Array[Attribute] = $(inputCols).flatMap { c =>
+      val field = schema(c)
+      val index = schema.fieldIndex(c)
+      field.dataType match {
+        case _: NumericType | BooleanType =>
+          val attr = Attribute.fromStructField(field)
+          if (attr.isNominal) {
+            if (addedNumericField) {
+              throw new SparkException("Categorical columns must precede all others, column out of order: " + c)
+            }
+            Some(attr.withName(c))
+          } else {
+            addedNumericField = true
+            None
+          }
+        case _: VectorUDT =>
+          val group = AttributeGroup.fromStructField(field)
+          if (group.attributes.isDefined) {
+            // If attributes are defined, copy them with updated names.
+ group.attributes.get.zipWithIndex.map { case (attr, i) => + if (attr.isNominal && attr.name.isDefined) { + if (addedNumericField) { + throw new SparkException("Categorical columns must precede all others, column out of order: " + c) + } + attr.withName(c + "_" + attr.name.get) + } else if (attr.isNominal) { + if (addedNumericField) { + throw new SparkException("Categorical columns must precede all others, column out of order: " + c) + } + attr.withName(c + "_" + i) + } else { + addedNumericField = true + null + } + }.filter(attr => attr != null) + } else { + addedNumericField = true + None + } + case otherType => + throw new SparkException(s"FastVectorAssembler does not support the $otherType type") + } + } + val metadata = new AttributeGroup($(outputCol), attrs).toMetadata() + + // Data transformation. + val assembleFunc = udf { r: Row => + FastVectorAssembler.assemble(r.toSeq: _*) + } + val args = $(inputCols).map { c => + schema(c).dataType match { + case DoubleType => dataset(c) + case _: VectorUDT => dataset(c) + case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") + } + } + + dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol), metadata)) + } + + override def transformSchema(schema: StructType): StructType = { + val inputColNames = $(inputCols) + val outputColName = $(outputCol) + val inputDataTypes = inputColNames.map(name => schema(name).dataType) + inputDataTypes.foreach { + case _: NumericType | BooleanType => + case t if t.isInstanceOf[VectorUDT] => + case other => + throw new IllegalArgumentException(s"Data type $other is not supported.") + } + if (schema.fieldNames.contains(outputColName)) { + throw new IllegalArgumentException(s"Output column $outputColName already exists.") + } + StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) + } + + override def copy(extra: ParamMap): FastVectorAssembler = defaultCopy(extra) + +} + +object FastVectorAssembler extends DefaultParamsReadable[FastVectorAssembler] { + + override def load(path: String): FastVectorAssembler = super.load(path) + + private[feature] def assemble(vv: Any*): Vector = { + val indices = ArrayBuilder.make[Int] + val values = ArrayBuilder.make[Double] + var cur = 0 + vv.foreach { + case v: Double => + if (v != 0.0) { + indices += cur + values += v + } + cur += 1 + case vec: Vector => + vec.foreachActive { case (i, v) => + if (v != 0.0) { + indices += cur + i + values += v + () + } + } + cur += vec.size + case null => + throw new SparkException("Values to assemble cannot be null.") + case o => + throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") + } + Vectors.sparse(cur, indices.result(), values.result()).compressed + } + +} diff --git a/src/core/spark/src/main/scala/MapArrayParam.scala b/src/core/spark/src/main/scala/MapArrayParam.scala new file mode 100644 index 0000000000..bad158fca7 --- /dev/null +++ b/src/core/spark/src/main/scala/MapArrayParam.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
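
Aside (not part of the patch): a minimal sketch of using the FastVectorAssembler defined above, respecting the categorical-columns-first ordering it enforces. The SparkSession value `spark` and the column names are illustrative assumptions.

    import org.apache.spark.ml.feature.{FastVectorAssembler, StringIndexer}
    import spark.implicits._

    val df = Seq(("piano", 1.0, 0.5), ("guitar", 2.0, 0.25), ("piano", 3.0, 0.75))
      .toDF("instrument", "x", "y")

    // StringIndexer produces a nominal (categorical) column; it must come first.
    val indexed = new StringIndexer()
      .setInputCol("instrument").setOutputCol("instrumentIdx")
      .fit(df).transform(df)

    val assembled = new FastVectorAssembler()
      .setInputCols(Array("instrumentIdx", "x", "y"))
      .setOutputCol("features")
      .transform(indexed)

Placing numeric columns before the indexed column would trigger the SparkException shown in the transform above, which is exactly the behavior the VerifyFastVectorAssembler tests check later in this patch.
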
+ +package org.apache.spark.ml.param + +import org.apache.spark.ml.util.Identifiable + +import scala.collection.JavaConverters._ +import scala.collection.immutable.Map +import scala.collection.mutable +import spray.json._ + +object MapArrayJsonProtocol extends DefaultJsonProtocol { + + import spray.json._ + implicit object MapJsonFormat extends JsonFormat[Map[String, Seq[String]]] { + def write(m: Map[String, Seq[String]]): JsValue = { + JsObject(m.mapValues { + case v: Seq[String] => seqFormat[String].write(v) + case default => serializationError(s"Unable to serialize $default") + }) + } + + def read(value: JsValue): Map[String, Seq[String]] = value.asInstanceOf[JsObject].fields.map(kvp => { + val convValue = kvp._2 match { + case v: JsValue => seqFormat[String].read(v) + case default => deserializationError(s"Unable to deserialize $default") + } + (kvp._1, convValue) + }) + } + +} + +import MapArrayJsonProtocol._ + +/** + * Param for Map of String to Seq of String. + */ +class MapArrayParam(parent: String, name: String, doc: String, isValid: Map[String, Seq[String]] => Boolean) + extends Param[Map[String, Seq[String]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Map[String, Seq[String]] => Boolean) = + + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + def w(value: java.util.HashMap[String, java.util.List[String]]): ParamPair[Map[String, Seq[String]]] = { + val mutMap = mutable.Map[String, Seq[String]]() + for (key <- value.keySet().asScala) { + val list = value.get(key).asScala + mutMap(key) = list + } + w(mutMap.toMap) + } + + override def jsonEncode(value: Map[String, Seq[String]]): String = { + val convertedMap = value.map(kvp => (kvp._1, kvp._2.toArray)) + val json = convertedMap.toJson + json.prettyPrint + } + + override def jsonDecode(json: String): Map[String, Seq[String]] = { + val jsonValue = json.parseJson + jsonValue.convertTo[Map[String, Seq[String]]] + } + +} diff --git a/src/core/spark/src/main/scala/MetadataUtilities.scala b/src/core/spark/src/main/scala/MetadataUtilities.scala new file mode 100644 index 0000000000..43f36c1272 --- /dev/null +++ b/src/core/spark/src/main/scala/MetadataUtilities.scala @@ -0,0 +1,10 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.sql.types + +object MetadataUtilities { + + def getMetadataKeys(metadata: Metadata): Iterable[String] = metadata.map.keys + +} diff --git a/src/core/spark/src/main/scala/TransformParam.scala b/src/core/spark/src/main/scala/TransformParam.scala new file mode 100644 index 0000000000..fb1233a3cc --- /dev/null +++ b/src/core/spark/src/main/scala/TransformParam.scala @@ -0,0 +1,58 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.ml.param + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.Identifiable +import scala.collection.JavaConverters._ + +/** + * Param for Transformer. Needed as spark has explicit params for many different types but not Transformer. 
*/ +class TransformerParam(parent: String, name: String, doc: String, isValid: Transformer => Boolean) + extends Param[Transformer](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Transformer => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = + this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + override def w(value: Transformer): ParamPair[Transformer] = + super.w(value) + + override def jsonEncode(value: Transformer): String = { + throw new NotImplementedError("The transform cannot be encoded.") + } + + override def jsonDecode(json: String): Transformer = { + throw new NotImplementedError("The transform cannot be decoded.") + } + +} + +/** + * Param for an Array of Transformers. + */ +class TransformerArrayParam(parent: String, name: String, doc: String, isValid: Array[Transformer] => Boolean) + extends Param[Array[Transformer]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Array[Transformer] => Boolean) = + + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + def w(value: java.util.List[Transformer]): ParamPair[Array[Transformer]] = w(value.asScala.toArray) + +} diff --git a/src/core/test/base/build.sbt b/src/core/test/base/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/test/base/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/test/base/src/main/scala/SparkSessionFactory.scala b/src/core/test/base/src/main/scala/SparkSessionFactory.scala new file mode 100644 index 0000000000..4b328406df --- /dev/null +++ b/src/core/test/base/src/main/scala/SparkSessionFactory.scala @@ -0,0 +1,53 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.log4j.{Level, Logger} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +// Convert configuration to JSON/ENV vars moving forward: +// 1. Logging Level +// 2. Warehouse directory +// 3. DiskBlockManager - currently defaults to USER TEMP it seems +// 3a. Does this derive from spark.local.dir? Should be configured as well? +// 4.
Actual Session host instead of local +object SparkSessionFactory { + + // Default spark warehouse = ./spark-warehouse + private val defaultWarehouseDirName = "spark-warehouse" + private val testDir = System.currentTimeMillis.toString + + private lazy val localWarehousePath = + "file:" + + customNormalize(new File(currentDir, defaultWarehouseDirName) + .getAbsolutePath()) + val workingDir = + "file:" + + customNormalize(new File(currentDir, testDir) + .getAbsolutePath()) + // On NTFS-like systems, normalize path + // (solves the problem of sending a path from spark to hdfs on Windows) + def customNormalize(path: String): String = { + if (File.separator != "\\") path + else path.replaceFirst("[A-Z]:", "").replace("\\", "/") + } + def currentDir(): String = System.getProperty("user.dir") + + def getSession(name: String, logLevel: String = "WARN"): SparkSession = { + val conf = new SparkConf() + .setAppName(name) + .setMaster("local[*]") + .set("spark.logConf", "true") + .set("spark.sql.warehouse.dir", SparkSessionFactory.localWarehousePath) + val sess = SparkSession.builder() + .config(conf) + .getOrCreate() + sess.sparkContext.setLogLevel(logLevel) + sess + } + +} diff --git a/src/core/test/base/src/main/scala/TestBase.scala b/src/core/test/base/src/main/scala/TestBase.scala new file mode 100644 index 0000000000..f5debe7ddd --- /dev/null +++ b/src/core/test/base/src/main/scala/TestBase.scala @@ -0,0 +1,155 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.reflect.ClassTag +import org.apache.spark._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.scalatest._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ +import org.scalactic.source.Position + +// Common test tags +object TestBase { + object Extended extends Tag("com.microsoft.ml.spark.test.tags.extended") + object LinuxOnly extends Tag("com.microsoft.ml.spark.test.tags.linuxonly") +} + +trait LinuxOnly extends TestBase { + override def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit pos: Position): Unit = + super.test(testName, testTags.toList.::(TestBase.LinuxOnly): _*)(testFun) +} + +abstract class TestBase extends FunSuite with BeforeAndAfterEachTestData with BeforeAndAfterAll { + + println(s"\n>>>-------------------- $this --------------------<<<") + + // "This Is A Bad Thing" according to my research. However, this is + // just for tests so maybe ok. A better design would be to break the + // session stuff into TestSparkSession as a trait and have test suites + // that need it "with TestSparkSession" instead, but that's a lot of + // changes right now and maybe not desired. 
+ private var sessionInitialized = false + protected lazy val session: SparkSession = { + info(s"Creating a spark session for suite $this") + sessionInitialized = true + SparkSessionFactory + .getSession(s"$this", logLevel = "WARN") + } + + protected lazy val sc: SparkContext = session.sparkContext + protected lazy val dir = SparkSessionFactory.workingDir + protected def normalizePath(path: String) = SparkSessionFactory.customNormalize(path) + + // Timing info + var suiteElapsed: Long = 0 + var testStart: Long = 0 + var testElapsed: Long = 0 + + // Test Fixture Overrides + protected override def beforeEach(td: TestData): Unit = { + testStart = System.currentTimeMillis + testElapsed = 0 + super.beforeEach(td) + } + + protected override def afterEach(td: TestData): Unit = { + try { + super.afterEach(td) + } + finally { + testElapsed = System.currentTimeMillis - testStart + logTime(s"Test ${td.name}", testElapsed, 3000) + suiteElapsed += testElapsed + } + } + + protected override def beforeAll(): Unit = { + if (sessionInitialized) { + info(s"Parallelism: ${session.sparkContext.defaultParallelism.toString}") + } + suiteElapsed = 0 + } + + protected override def afterAll(): Unit = { + logTime(s"Suite $this", suiteElapsed, 10000) + if (sessionInitialized) { + info("Shutting down spark session") + session.stop() + } + } + + // Utilities + + def withoutLogging[T](e: => T): T = { + // This should really keep the old level, but there is no sc.getLogLevel, so + // take the cheap way out for now: just use "WARN", and do something proper + // when/if needed + sc.setLogLevel("OFF") + try e finally sc.setLogLevel("WARN") + } + + def interceptWithoutLogging[E <: Exception: ClassTag](e: => Any): Unit = { + withoutLogging { intercept[E] { e }; () } + } + + def assertSparkException[E <: Exception: ClassTag](stage: PipelineStage, data: DataFrame): Unit = { + withoutLogging { + intercept[E] { + val transformer = stage match { + case e: Estimator[_] => e.fit(data) + case t: Transformer => t + case _ => sys.error(s"Unknown PipelineStage value: $stage") + } + // use .length to force the pipeline (.count might work, but maybe it's sometimes optimized) + transformer.transform(data).foreach { r => r.length; () } + } + () + } + } + + import session.implicits._ + + def makeBasicDF(): DataFrame = { + val df = Seq( + (0, "guitars", "drums"), + (1, "piano", "trumpet"), + (2, "bass", "cymbals")).toDF("numbers","words", "more") + df + } + + def makeBasicNullableDF(): DataFrame = { + val df = Seq( + (0, 2.5, "guitars", "drums"), + (1, Double.NaN, "piano", "trumpet"), + (2, 8.9, "bass", null)).toDF("indices", "numbers","words", "more") + df + } + + def verifyResult(expected: DataFrame, result: DataFrame): Boolean = { + assert(expected.count == result.count) + assert(expected.schema.length == result.schema.length) + (expected.columns zip result.columns).forall{ case (x,y) => x == y } + } + + def time[R](block: => R): R = { + val t0 = System.nanoTime() + val result = block + val t1 = System.nanoTime() + println(s"Elapsed time: ${(t1 - t0) / 1e9} sec") + result + } + + private def logTime(name: String, time: Long, threshold: Long) = { + val msg = s"$name took ${time / 1000.0}s" + if (time > threshold) { + alert(msg) + } else { + info(msg) + } + } + +} diff --git a/src/core/test/build.sbt b/src/core/test/build.sbt new file mode 100644 index 0000000000..e3bafe48f4 --- /dev/null +++ b/src/core/test/build.sbt @@ -0,0 +1 @@ +Extras.noJar diff --git a/src/core/test/datagen/build.sbt b/src/core/test/datagen/build.sbt new file mode 
100644 index 0000000000..6c29f8db94 --- /dev/null +++ b/src/core/test/datagen/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core/test/base diff --git a/src/core/test/datagen/src/main/scala/DatasetConstraints.scala b/src/core/test/datagen/src/main/scala/DatasetConstraints.scala new file mode 100644 index 0000000000..d67c82fca9 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/DatasetConstraints.scala @@ -0,0 +1,68 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import breeze.stats.distributions.{Rand, RandBasis, Uniform} +import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} + +import scala.util.Random + +/** + * Specifies the trait for constraints on generating a dataset. + */ +trait HasDatasetGenerationConstraints { + var numRows: Int + var numCols: Int + var numSlotsPerCol: Array[Int] + var randomizeColumnNames: Boolean +} + +/** + * Basic constraints for generating a dataset. + */ +class BasicDatasetGenerationConstraints(numberOfRows: Int, numberOfColumns: Int, numberOfSlotsPerColumn: Array[Int]) + extends HasDatasetGenerationConstraints { + override var numRows: Int = numberOfRows + override var numCols: Int = numberOfColumns + override var numSlotsPerCol: Array[Int] = numberOfSlotsPerColumn + override var randomizeColumnNames: Boolean = true +} + +/** + * Constraints on generating a dataset where all parameters are randomly generated. + * @param minRows The min number of rows. + * @param maxRows The max number of rows. + * @param minCols The min number of columns. + * @param maxCols The max number of columns. + * @param minSlots The min number of slots. + * @param maxSlots The max number of slots. + */ +class RandomDatasetGenerationConstraints(minRows: Int, + maxRows: Int, + minCols: Int, + maxCols: Int, + minSlots: Int, + maxSlots: Int) + extends HasDatasetGenerationConstraints { + + override var numRows: Int = _ + override var numCols: Int = _ + override var numSlotsPerCol: Array[Int] = _ + override var randomizeColumnNames: Boolean = _ + + /** + * Generates values for rows, columns and slots based on the given constraints using a random number generator. + * @param random The random number generator. + */ + def generateConstraints(random: Random): Unit = { + val rand = new RandBasis(new MersenneTwister(random.nextInt())) + val distributionRows = new Uniform(minRows.toDouble, maxRows.toDouble)(rand) + val distributionCols = new Uniform(minCols.toDouble, maxCols.toDouble)(rand) + val distributionSlots = new Uniform(minSlots.toDouble, maxSlots.toDouble)(rand) + numRows = distributionRows.draw().toInt + numCols = distributionCols.draw().toInt + numSlotsPerCol = (1 to numCols).map(col => distributionSlots.draw().toInt).toArray + } + +} diff --git a/src/core/test/datagen/src/main/scala/DatasetOptions.scala b/src/core/test/datagen/src/main/scala/DatasetOptions.scala new file mode 100644 index 0000000000..5e75992f58 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/DatasetOptions.scala @@ -0,0 +1,57 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions + +/** + * Specifies the column types supported in spark dataframes and modules.
+ */ +object ColumnOptions extends Enumeration { + type ColumnOptions = Value + // TODO: add Categorical, DenseVector, SparseVector + val Scalar = Value +} + +/** + * Specifies the data types supported in spark dataframes and modules. + */ +object DataOptions extends Enumeration { + type DataOptions = Value + val String, Int, Double, Boolean, Date, Timestamp, Byte, Short = Value +} + +/** + * Options used to specify how a dataset will be generated. + * This contains information on what the data and column types + * (specified as flags) for generating a dataset will be limited to. + * It also contain options for all possible missing values generation + * and options for how values will be generated. + */ +case class DatasetOptions(columnTypes: ColumnOptions.ValueSet, + dataTypes: DataOptions.ValueSet, + missingValuesOptions: DatasetMissingValuesGenerationOptions) + +object DatasetOptions { + def apply(columnOptions: ColumnOptions.ValueSet, dataOptions: DataOptions.ValueSet): DatasetOptions = { + val missingValueOptions = DatasetMissingValuesGenerationOptions(0.0, columnOptions, dataOptions) + new DatasetOptions(columnOptions, dataOptions, missingValueOptions) + } + + def apply(columnOption: ColumnOptions, dataOption: DataOptions): DatasetOptions = { + val colOptions = ColumnOptions.ValueSet(columnOption) + val dataOptions = DataOptions.ValueSet(dataOption) + val missingValueOptions = DatasetMissingValuesGenerationOptions(0.0, colOptions, dataOptions) + new DatasetOptions(colOptions, dataOptions, missingValueOptions) + } +} + +case class DatasetMissingValuesGenerationOptions(percentMissing: Double, + columnTypesWithMissings: ColumnOptions.ValueSet, + dataTypesWithMissings: DataOptions.ValueSet) { + def hashMissing(): Boolean = { + !columnTypesWithMissings.isEmpty && !dataTypesWithMissings.isEmpty + } +} diff --git a/src/core/test/datagen/src/main/scala/GenerateDataType.scala b/src/core/test/datagen/src/main/scala/GenerateDataType.scala new file mode 100644 index 0000000000..5b72dc4c84 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateDataType.scala @@ -0,0 +1,37 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.sql.Timestamp +import java.sql.Date +import org.apache.commons.lang.RandomStringUtils +import scala.util.Random + +/** + * Generates the specified random data type. + */ +class GenerateDataType(random: Random) extends Serializable { + + def nextTimestamp: Timestamp = new Timestamp(random.nextLong()) + + def nextBoolean: Boolean = random.nextBoolean() + + def nextByte: Byte = { + val byteArray = new Array[Byte](1) + random.nextBytes(byteArray) + byteArray(0) + } + + def nextDouble: Double = random.nextDouble() + + def nextInt: Int = random.nextInt() + + def nextShort: Short = random.nextInt(Short.MaxValue).toShort + + def nextString: String = RandomStringUtils.random(random.nextInt(100), 0, 0, true, true, null, + new java.util.Random(random.nextLong())) + + def nextDate: Date = new Date(random.nextLong()) + +} diff --git a/src/core/test/datagen/src/main/scala/GenerateDataset.scala b/src/core/test/datagen/src/main/scala/GenerateDataset.scala new file mode 100644 index 0000000000..ed1432b75f --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateDataset.scala @@ -0,0 +1,114 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
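A minimal sketch of how the options and random-value generator defined above might be combined (illustrative only; it assumes the snippet lives in the com.microsoft.ml.spark package and the seed is arbitrary):

import scala.util.Random

// Restrict generation to scalar columns holding Int or Double values
val options = DatasetOptions(
  ColumnOptions.ValueSet(ColumnOptions.Scalar),
  DataOptions.ValueSet(DataOptions.Int, DataOptions.Double))

// Draw individual random values with a fixed seed for reproducibility
val gen = new GenerateDataType(new Random(42))
val anInt: Int = gen.nextInt
val aDouble: Double = gen.nextDouble
val aString: String = gen.nextString // alphanumeric, fewer than 100 characters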
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +import scala.util.Random + +/** + * Defines methods to generate a random spark DataFrame dataset based on given options. + */ +object GenerateDataset { + + /** + * Generates a random Spark DataFrame given a set of dataset generation constraints. + * @param sparkSession The spark session. + * @param datasetGenerationConstraints The dataset generation constraints to use. + * @param seed The random seed. + * @return A randomly generated dataset. + */ + def generateDataset(sparkSession: SparkSession, + datasetGenerationConstraints: HasDatasetGenerationConstraints, + seed: Long): DataFrame = { + generateDatasetFromOptions(sparkSession, Map[Int, DatasetOptions](), datasetGenerationConstraints, seed) + } + + /** + * Generates a random Spark DataFrame given a map of index to DataGenerationOptions. + * @param sparkSession The spark session. + * @param indexToOptions The map of indexes to DataGenerationOptions. + * @param datasetGenerationConstraints The constraints for generating the dataset. + * @param seed The random seed. + * @return The randomly generated dataset. + */ + def generateDatasetFromOptions(sparkSession: SparkSession, + indexToOptions: Map[Int, DatasetOptions], + datasetGenerationConstraints: HasDatasetGenerationConstraints, + seed: Long): DataFrame = { + + val random = new Random(seed) + val numCols: Int = datasetGenerationConstraints.numCols + val datasetGenerationOptions = (1 to numCols). + map(index => + if (indexToOptions.contains(index)) indexToOptions(index) + else new DatasetOptions(ColumnOptions.values, + DataOptions.values, + new DatasetMissingValuesGenerationOptions(0.5, + ColumnOptions.values, + DataOptions.values))) + // Get random options chosen from given valid space-dimension complex + val chosenOptions:Array[(ColumnOptions, DataOptions)] = + datasetGenerationOptions.toArray.map(option => chooseOptions(option, random)) + + val rdd = RandomRDDs.randomRDD[Row](sparkSession.sparkContext, + new RandomRowGeneratorCombiner(chosenOptions.map(option => new RandomRowGenerator(option._1, option._2))), + datasetGenerationConstraints.numRows.toLong, 1, random.nextLong()) + sparkSession.createDataFrame(rdd, getSchemaFromOptions(chosenOptions, random)) + } + + def getOptionsFromSchema(schema: StructType): Map[Int, DatasetOptions] = { + val datasetOptions = schema.map(sf => DatasetOptions(ColumnOptions.Scalar, getOptionsFromDataType(sf.dataType))) + datasetOptions.zipWithIndex.map(kvp => (kvp._2 + 1, kvp._1)).toMap + } + + private def chooseOptions(options: DatasetOptions, random: Random) = { + val (optionsColumnArray, optionsDataArray) = (options.columnTypes.toArray, options.dataTypes.toArray) + (optionsColumnArray(random.nextInt(optionsColumnArray.length)), + optionsDataArray(random.nextInt(optionsDataArray.length))) + } + + private def getSchemaFromOptions(chosenOptions: Array[(ColumnOptions, DataOptions)], + random: Random): StructType = { + val generateDataType = new GenerateDataType(random) + new StructType( + chosenOptions + .map(option => getDataTypeFromOptions(option._2)) + .map(dataType => StructField(generateDataType.nextString, dataType))) + } + + lazy val dataTypeToOptions: Map[DataOptions, DataType] = Map( + DataOptions.String -> StringType, + DataOptions.Timestamp -> TimestampType, + DataOptions.Short 
-> ShortType, + DataOptions.Int -> IntegerType, + DataOptions.Boolean -> BooleanType, + DataOptions.Byte -> ByteType, + DataOptions.Date -> DateType, + DataOptions.Double -> DoubleType + ) + + lazy val optionsToDataType: Map[DataType, DataOptions] = dataTypeToOptions.map(kvp => (kvp._2, kvp._1)) + + private def getDataTypeFromOptions(data: DataOptions): DataType = { + if (dataTypeToOptions.contains(data)) { + dataTypeToOptions(data) + } else { + throw new Exception("The type does not exist in spark: " + data) + } + } + + private def getOptionsFromDataType(data: DataType): DataOptions = { + if (optionsToDataType.contains(data)) { + optionsToDataType(data) + } else { + throw new Exception("The corresponding option does not exist for spark data type: " + data) + } + } + +} diff --git a/src/core/test/datagen/src/main/scala/GenerateRow.scala b/src/core/test/datagen/src/main/scala/GenerateRow.scala new file mode 100644 index 0000000000..a258bc1f46 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateRow.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions +import org.apache.spark.mllib.random.RandomDataGenerator +import org.apache.spark.sql.Row + +import scala.util.Random + +/** + * Combines an array of row generators into a single row generator. + * @param generators + */ +class RandomRowGeneratorCombiner(generators: Array[RandomMMLGenerator[Row]]) extends RandomMMLGenerator[Row] { + + override def nextValue(): Row = Row.merge(generators.map(generator => generator.nextValue()): _*) + + override def copy(): RandomRowGeneratorCombiner = new RandomRowGeneratorCombiner(generators) + +} + +/** + * Randomly generates a row given the set space of data, column options. + * @param col The column generation options specifying the column type to generate. + * @param data The data generation options specifying the data to generate. + */ +class RandomRowGenerator(col: ColumnOptions, data: DataOptions) extends RandomMMLGenerator[Row] { + + override def nextValue(): Row = { + if (data == DataOptions.Boolean) + Row(random.nextBoolean) + else if (data == DataOptions.Byte) + Row(random.nextByte) + else if (data == DataOptions.Double) + Row(random.nextDouble) + else if (data == DataOptions.Int) + Row(random.nextInt) + else if (data == DataOptions.Short) + Row(random.nextShort) + else if (data == DataOptions.String) + Row(random.nextString) + else if (data == DataOptions.Date) + Row(random.nextDate) + else if (data == DataOptions.Timestamp) + Row(random.nextTimestamp) + else throw new Exception("Selected type not supported: " + data) + } + + override def copy(): RandomRowGenerator = new RandomRowGenerator(col, data) + +} + +/** + * Base abstract class for random generation of data. + * @tparam T The data to generate. 
+ */ +abstract class RandomMMLGenerator[T] extends RandomDataGenerator[T] { + + var seed: Long = 0 + var random: GenerateDataType = new GenerateDataType(new Random(seed)) + + override def setSeed(seed: Long): Unit = { + random = new GenerateDataType(new Random(seed)) + this.seed = seed + } + +} diff --git a/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala b/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala new file mode 100644 index 0000000000..48dd3e0afa --- /dev/null +++ b/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.DataFrame + +/** + * Used to provide overrides on datasets to be constructed for testing fit/transform and default values + */ +abstract class EstimatorFuzzingTest extends TestBase { + def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = estimator + + def createFitDataset: DataFrame = { + val schema = schemaForDataset + GenerateDataset.generateDatasetFromOptions(session, + GenerateDataset.getOptionsFromSchema(schema), + new BasicDatasetGenerationConstraints(5, schema.size, Array()), + 0).toDF(schemaForDataset.map(_.name): _*) + } + + def createTransformDataset: DataFrame = createFitDataset + + def schemaForDataset: StructType + + def getEstimator(): Estimator[_] + + def getClassName: String = getEstimator().getClass.getName +} + +/** + * Used to provide overrides on datasets to be constructed for testing transform and default values + */ +abstract class TransformerFuzzingTest extends TestBase { + def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = transformer + + def createDataset: DataFrame = { + val schema = schemaForDataset + GenerateDataset.generateDatasetFromOptions(session, + GenerateDataset.getOptionsFromSchema(schema), + new BasicDatasetGenerationConstraints(5, schema.size, Array()), + 0) + } + + def schemaForDataset: StructType + + def getTransformer(): Transformer + + def getClassName: String = getTransformer().getClass.getName +} diff --git a/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala b/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala new file mode 100644 index 0000000000..94a7d5cd52 --- /dev/null +++ b/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala @@ -0,0 +1,46 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +/** + * Verifies generating a dataset using the api. 
+ */ +class VerifyGenerateDataset extends TestBase { + + test("Smoke test to verify that generating a dataset works") { + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val seed = 1337 + val df = GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + seed.toLong) + assert(df.columns.length == numCols) + assert(df.count == numRows) + } + + test("Verify that the generated dataset is always the same") { + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val seed = 1337 + + val datasets = (0 to 10).map(i => GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + seed.toLong)) + + assert(datasets.forall(df => verifyResult(df, datasets(0))), "Datasets must be equal") + } + + test("Verify that for different seed, you will get different datasets") { + val numRows = 25 + val numCols = 10 + + val datasets = (0 to 10).map(i => GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, Array()), i.toLong)) + + assert(!datasets.forall(df => verifyResult(df, datasets(0))), "Datasets must not be equal for different seeds") + } + +} diff --git a/src/data-conversion/build.sbt b/src/data-conversion/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/data-conversion/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/data-conversion/src/main/scala/DataConversion.scala b/src/data-conversion/src/main/scala/DataConversion.scala new file mode 100644 index 0000000000..073ca6b714 --- /dev/null +++ b/src/data-conversion/src/main/scala/DataConversion.scala @@ -0,0 +1,161 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import java.sql.Timestamp + +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import schema._ + +object DataConversion extends DefaultParamsReadable[DataConversion] + +/* +This class takes a DataFrame, a comma separated list of column names, and a conversion action and returns +a new DataFrame with the contents of the selected columns coverted to the requested type. 
+ */ +class DataConversion(override val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("DataConversion")) + + val col: Param[String] = StringParam(this, "col", + "comma separated list of columns whose type will be converted", "") + + /** @group getParam **/ + final def getCol: String = $(col) + + /** @group setParam **/ + def setCol(value: String): this.type = set(col, value) + + val convertTo: Param[String] = StringParam(this, "convertTo", "the result type", "") + + /** @group getParam **/ + final def getConvertTo: String = $(convertTo) + + /** @group setParam **/ + def setConvertTo(value: String): this.type = set(convertTo, value) + + val dateTimeFormat: Param[String] = StringParam(this, "dateTimeFormat", + "format for DateTime when making DateTime:String conversions", "yyyy-MM-dd HH:mm:ss") + + /** @group getParam **/ + final def getDateTimeFormat: String = $(dateTimeFormat) + + /** @group setParam **/ + def setDateTimeFormat(value: String): this.type = set(dateTimeFormat, value) + + override def transform(dataset: Dataset[_]): DataFrame = { + require($(col) != null, "No column name specified") + require(dataset != null, "No dataset supplied") + require(dataset.columns.length != 0, "Dataset with no columns cannot be converted") + val colsList = $(col).split(",").map(_.trim) + val errorList = verifyCols(dataset.toDF(), colsList) + if (!errorList.isEmpty) { + throw new NoSuchElementException + } + var df = dataset.toDF + + val res: DataFrame = { + for (convCol <- colsList) { + df = $(convertTo) match { + case "boolean" => numericTransform(df, BooleanType, convCol) + case "byte" => numericTransform(df, ByteType, convCol) + case "short" => numericTransform(df, ShortType, convCol) + case "integer" => numericTransform(df, IntegerType, convCol) + case "long" => numericTransform(df, LongType, convCol) + case "float" => numericTransform(df, FloatType, convCol) + case "double" => numericTransform(df, DoubleType, convCol) + case "string" => numericTransform(df, StringType, convCol) + case "toCategorical" => SparkSchema.makeCategorical(df, convCol, convCol, true) + case "clearCategorical" => SparkSchema.makeNonCategorical(df, convCol, convCol) + case "date" => toDateConversion(df, convCol) + } + } + df + } + res + } + + /** + * @param dataset - The input dataset, to be transformed + * @param paramMap - ParamMap which contains parameter value to override the default value + * @return - the DataFrame that results from data conversion + */ + override def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame = { + setCol(paramMap.getOrElse(new Param("col", "col","name of column whose type will be converted"), "")) + setConvertTo(paramMap.getOrElse(new Param("convertTo", "convertTo","result type"), "")) + setDateTimeFormat(paramMap.getOrElse(new Param("dateTimeFormat", "dateTimeFormat", "time string format"), "")) + transform(dataset) + } + + def transformSchema(schema: StructType): StructType = { + System.err.println("transformSchema not implemented yet") + schema + } + + def copy(extra: ParamMap): DataConversion = defaultCopy(extra) + + /* + Convert to a numeric type or a string. If the input type was a TimestampType, tnen do a different conversion? 
+ */ + private def numericTransform(df: DataFrame, outType: DataType, columnName: String): DataFrame = { + val inType = df.schema(columnName).dataType + if (inType == StringType && outType == BooleanType) throw new Exception("String to Boolean not supported") + val res = inType match { + case TimestampType => fromDateConversion(df, outType, columnName) + case _ => df.withColumn(columnName, df(columnName).cast(outType).as(columnName)) + } + res + } + + /* + Convert a TimestampType to a StringType or a LongType, else error + */ + private def fromDateConversion(df: DataFrame, outType: DataType, columnName: String): DataFrame = { + require(outType == StringType || outType == LongType, "Date only converts to string or long") + val res = outType match { + case LongType => { + val getTime = udf((t:java.sql.Timestamp)=>t.getTime()) + df.withColumn(columnName, getTime(df(columnName))) + } + case StringType => { + val parseTimeString = udf((t:java.sql.Timestamp)=>{ + val f:java.text.SimpleDateFormat = new java.text.SimpleDateFormat($(dateTimeFormat));f.format(t)}) + df.withColumn(columnName, parseTimeString(df(columnName))) + } + } + res + } + + private def toDateConversion(df: DataFrame, columnName: String): DataFrame = { + val inType = df.schema(columnName).dataType + require(inType == StringType || inType == LongType, "Can only convert string or long to Date") + val res = inType match { + case StringType => { + val f = new java.text.SimpleDateFormat($(dateTimeFormat)) + val parseTimeFromString = udf((t:String)=>{new Timestamp(f.parse(t).getTime)}) + df.withColumn(columnName, parseTimeFromString(df(columnName)).cast("timestamp")).as(columnName) + } + case LongType => { + val longToTimestamp = udf((t:Long)=>{new java.sql.Timestamp(t)}) + df.withColumn(columnName, longToTimestamp(df(columnName))) + } + } + res + } + + private def verifyCols(df: DataFrame, req: Array[String]): List[String] = { + req.foldLeft(List[String]()) { (l, r) => + if (df.columns.contains(r)) l + else { + System.err.println(s"DataFrame does not contain specified column: $r") + r :: l + } + } + } + +} diff --git a/src/data-conversion/src/test/scala/VerifyDataConversion.scala b/src/data-conversion/src/test/scala/VerifyDataConversion.scala new file mode 100644 index 0000000000..49b42eebf0 --- /dev/null +++ b/src/data-conversion/src/test/scala/VerifyDataConversion.scala @@ -0,0 +1,232 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
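For orientation, a small usage sketch of the DataConversion transformer above; the DataFrame `df` and its column names are hypothetical:

// Convert two comma-listed columns to doubles in a single pass
val asDoubles = new DataConversion()
  .setCol("age, income")
  .setConvertTo("double")
  .transform(df)

// Tag a string column with categorical metadata
val withCategory = new DataConversion()
  .setCol("country")
  .setConvertTo("toCategorical")
  .transform(asDoubles)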
+ +package com.microsoft.ml.spark + +import java.sql.Timestamp + +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ +import schema._ + +class VerifyDataConversions extends TestBase { + + import session.implicits._ + + val testVal: Long = (Int.MaxValue).toLong + 100 + val testShort: Integer = Short.MaxValue + 100 + /* + DataFrame for the numerical and string <--> numerical conversions + */ + val masterInDF = Seq((true: Boolean, 1: Byte, 2: Short, 3: Integer, 4: Long, 5.0F, 6.0, "7", "8.0"), + (false, 9: Byte, 10: Short, 11: Integer, 12: Long, 14.5F, 15.5, "16", "17.456"), + (true, -127: Byte, 345: Short, testShort, testVal, 18.91F, 20.21, "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + + /* + DataFrame of Timestamp data + */ + val tsDF = Seq("1986-07-27 12:48:00.123", "1988-11-01 11:08:48.456", "1993-08-06 15:32:00.789").toDF("Col0") + .select($"Col0".cast("timestamp")) + + /* + Timestamps-as-longs DataFrame. These longs were generated by feeding the above timestamp + values to Timestamp.getTime() + */ + val f = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") + val parseTimeFromString = udf((t:String)=>{new Timestamp(f.parse(t).getTime)}) + val lDF = Seq(f.parse("1986-07-27 12:48:00.123").getTime(), + f.parse("1988-11-01 11:08:48.456").getTime(), + f.parse("1993-08-06 15:32:00.789").getTime()).toDF("Col0") + + /* + Timestamps-as-strings DataFrame + */ + val sDF = Seq("1986-07-27 12:48:00.123", "1988-11-01 11:08:48.456", "1993-08-06 15:32:00.789").toDF("Col0") + + /* + Test conversion of all numeric types to Boolean. + Strings are cast to null, which causes the comparison test to fail, so for now I + am skipping the string tests.
+ Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + */ + test("Test convert all types to Boolean") { + val r1 = new DataConversion().setCol("byte").setConvertTo("boolean").transform(masterInDF) + val r2 = new DataConversion().setCol("short").setConvertTo("boolean").transform(r1) + val r3 = new DataConversion().setCol("int").setConvertTo("boolean").transform(r2) + val r4 = new DataConversion().setCol("long").setConvertTo("boolean").transform(r3) + val r5 = new DataConversion().setCol("float").setConvertTo("boolean").transform(r4) + val r6 = new DataConversion().setCol("double").setConvertTo("boolean").transform(r5) + val expectedRes = Seq(( true, true, true, true, true, true, true, "7", "8.0"), + (false, true, true, true, true, true, true, "16", "17.456"), + (true, true, true, true, true, true, true, "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(r6.schema("byte").dataType == BooleanType) + assert(r6.schema("short").dataType == BooleanType) + assert(r6.schema("int").dataType == BooleanType) + assert(r6.schema("long").dataType == BooleanType) + assert(r6.schema("float").dataType == BooleanType) + assert(r6.schema("double").dataType == BooleanType) + } + + /* + Verify sting to boolean throws an error + */ + test("Test convert string to boolean throws an exception") { + assertThrows[Exception] { + new DataConversion().setCol("intstring").setConvertTo("boolean").transform(masterInDF) + } + } + + /* + Test conversion of all numeric types to Byte, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Byte") { + val expectedDF = Seq(( 1: Byte, 1: Byte, 2: Byte, 3: Byte, 4: Byte, 5: Byte, 6: Byte, 7: Byte, 8: Byte), + (0: Byte, 9: Byte, 10: Byte, 11: Byte, 12: Byte, 14: Byte, 127: Byte, 16: Byte, 17: Byte), + (1: Byte, -127: Byte, 89: Byte, 99: Byte, 99: Byte, 18: Byte, 20: Byte, 100: Byte, -56: Byte)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + val res = generateRes("byte", masterInDF) + assert(res.schema("bool").dataType == ByteType) + assert(res.schema("short").dataType == ByteType) + assert(res.schema("int").dataType == ByteType) + assert(res.schema("long").dataType == ByteType) + assert(res.schema("float").dataType == ByteType) + assert(res.schema("double").dataType == ByteType) + assert(res.schema("intstring").dataType == ByteType) + assert(res.schema("doublestring").dataType == ByteType) + } + + /* + Test conversion of all numeric types to Short, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. 
For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Short") { + val expectedDF = Seq(( 1: Short, 1: Short, 2: Short, 3: Short, 4: Short, 5: Short, 6: Short, 7: Short, 8: Short), + (0: Short, 9: Short, 10: Short, 11: Short, 12: Short, 14: Short, 15: Short, 16: Short, 17: Short), + (1: Short, -127: Short, 345: Short, -32669: Short, 99: Short, 18: Short, 20: Short, 100: Short, 200: Short)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("short", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Integer, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Integer") { + val expectedDF = Seq((1, 1, 2, 3, 4, 5, 6, 7, 8), + (0, 9, 10, 11, 12, 14, 15, 16, 17), + (1, -127, 345, 32867, -2147483549, 18, 20, 100, 200)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("integer", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Long, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. 
For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Long") { + val expectedDF = Seq((1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), + (0L, 9L, 10L, 11L, 12L, 14L, 15L, 16L, 17L), + (1L, -127L, 345L, 32867L, 2147483747L, 18L, 20L, 100L, 200L)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("long", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Double, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + */ + test("Test convert to Double") { + val fToD = 18.91F.toDouble + val expectedDF = Seq((1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0), + (0.0, 9.0, 10.0, 11.0, 12.0, 14.5, 15.5, 16.0, 17.456), + (1.0, -127.0, 345.0, 32867.0, 2147483747.0, fToD, 20.21, 100.0, 200.12345)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("double", masterInDF)).count == 0) + } + + // Test the conversions to string + test("Test convert all types to String") { + val expectedDF = Seq(("true", "1", "2", "3", "4", "5.0", "6.0", "7", "8.0"), + ("false", "9", "10", "11", "12", "14.5", "15.5", "16", "17.456"), + ("true", "-127", "345", "32867", "2147483747", "18.91", "20.21", "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("string", masterInDF)).count == 0) + } + + // Test convert to categorical: + test("Test convert to categorical") { + val inDF = Seq(("piano", 1, 2), ("drum", 3, 4), ("guitar", 5, 6)).toDF("instruments", "c1", "c2") + val res = new DataConversion().setCol("instruments").setConvertTo("toCategorical").transform(inDF) + assert(SparkSchema.isCategorical(res, "instruments")) + } + + // Test clearing categorical + test("Test that categorical features will be cleared") { + val inDF = Seq(("piano", 1, 2), ("drum", 3, 4), ("guitar", 5, 6)).toDF("instruments", "c1", "c2") + val res = new DataConversion().setCol("instruments").setConvertTo("toCategorical").transform(inDF) + assert(SparkSchema.isCategorical(res, "instruments")) + val res2 = new DataConversion().setCol("instruments").setConvertTo("clearCategorical").transform(res) + assert(!SparkSchema.isCategorical(res2, "instruments")) + assert(inDF.except(res2).count == 0) + } + + // Verify that a TimestampType is converted to a LongType + test("Test timestamp to long conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("long") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(tsDF) + assert(res.schema("Col0").dataType == LongType) + assert(lDF.except(res).count == 0) + } + + // Test the reverse - long to timestamp + test("Test long to timestamp conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("date") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(lDF) + assert(res.schema("Col0").dataType == TimestampType) + assert(tsDF.except(res).count == 0) + } + + test("Test timestamp to string conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("string") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(tsDF) + assert(res.schema("Col0").dataType == StringType) + assert(sDF.except(res).count == 0) + } + + test("Test date string to timestamp 
conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("date") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(sDF) + val res2 = new DataConversion().setCol("Col0").setConvertTo("long") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(res) + assert(res.schema("Col0").dataType == TimestampType) + assert(tsDF.except(res).count == 0) + } + + def generateRes(convTo: String, inDF: DataFrame): DataFrame = { + val result = new DataConversion().setCol("bool, byte, short, int, long, float, double, intstring, doublestring") + .setConvertTo(convTo).transform(masterInDF) + result + } + +} diff --git a/src/downloader/build.sbt b/src/downloader/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/downloader/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/downloader/src/main/python/ModelDownloader.py b/src/downloader/src/main/python/ModelDownloader.py new file mode 100644 index 0000000000..b774f63184 --- /dev/null +++ b/src/downloader/src/main/python/ModelDownloader.py @@ -0,0 +1,101 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from pyspark.ml.param.shared import * +from mmlspark.Utils import * + +DEFAULT_URL = "https://mmlspark.azureedge.net/datasets/CNTKModels/" + + +class ModelSchema: + def __init__(self, name, dataset, modelType, uri, hash, size, inputNode, numLayers, layerNames): + """ + An object that represents a model. + + :param name: Name of the model + :param dataset: Dataset it was trained on + :param modelType: Domain that the model operates on + :param uri: The location of the model's bytes + :param hash: The sha256 hash of the models bytes + :param size: the size of the model in bytes + :param inputNode: the node which represents the input + :param numLayers: the number of layers of the model + :param layerNames: the names of nodes that represent layers in the network + """ + self.name = name + self.dataset = dataset + self.modelType = modelType + self.uri = uri + self.hash = hash + self.size = size + self.inputNode = inputNode + self.numLayers = numLayers + self.layerNames = layerNames + + def __str__(self): + return self.__repr__() + + def __repr__(self): + return "ModelSchema".format(self.name, self.dataset, self.uri) + + def toJava(self, sparkSession): + ctx = sparkSession.sparkContext + uri = ctx._jvm.java.net.URI(self.uri) + return ctx._jvm.com.microsoft.ml.spark.ModelSchema( + self.name, self.dataset, self.modelType, + uri, self.hash, self.size, self.inputNode, + self.numLayers, self.layerNames) + + @staticmethod + def fromJava(jobj): + return ModelSchema(jobj.name(), jobj.dataset(), + jobj.modelType(), jobj.uri().toString(), + jobj.hash(), jobj.size(), jobj.inputNode(), + jobj.numLayers(), list(jobj.layerNames())) + + +class ModelDownloader: + def __init__(self, sparkSession, localPath, serverURL=DEFAULT_URL): + """ + A class for downloading CNTK pretrained models in python. To download all models use the downloadModels + function. To browse models from the microsoft server please use remoteModels. + + :param sparkSession: A spark session for interfacing between python and java + :param localPath: The folder to save models to + :param serverURL: The location of the model Server, beware this default can change! 
+ """ + self.localPath = localPath + self.serverURL = serverURL + + self._sparkSession = sparkSession + self._ctx = sparkSession.sparkContext + self._model_downloader = self._ctx._jvm.com.microsoft.ml.spark.ModelDownloader( + sparkSession._jsparkSession, localPath, serverURL) + + def _wrap(self, iter): + return (ModelSchema.fromJava(s) for s in iter) + + def localModels(self): + return self._wrap(self._model_downloader.localModels()) + + def remoteModels(self): + return self._wrap(self._model_downloader.remoteModels()) + + def downloadModel(self, model): + model = model.toJava(self._sparkSession) + return ModelSchema.fromJava(self._model_downloader.downloadModel(model)) + + def downloadByName(self, name): + return ModelSchema.fromJava(self._model_downloader.downloadByName(name)) + + def downloadModels(self, models=None): + if models is None: + models = self.remoteModels() + models = (m.toJava(self._sparkSession) for m in models) + + return list(self._wrap(self._model_downloader.downloadModels(models))) diff --git a/src/downloader/src/main/scala/ModelDownloader.scala b/src/downloader/src/main/scala/ModelDownloader.scala new file mode 100644 index 0000000000..55b42a3bb0 --- /dev/null +++ b/src/downloader/src/main/scala/ModelDownloader.scala @@ -0,0 +1,260 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io._ +import java.net.{URI, URL} +import java.util +import org.apache.commons.io.IOUtils +import org.apache.hadoop.conf.{Configuration => HadoopConf} +import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} +import org.apache.hadoop.io.{IOUtils => HUtils} +import org.apache.log4j.LogManager +import org.apache.spark.sql.SparkSession +import spray.json._ +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +/** + * Abstract representation of a repository for future expansion + * + * @tparam S an instantiation of the + */ +private[spark] abstract class Repository[S <: Schema] { + + def listSchemas(): Iterable[S] + + def getBytes(schema: S): InputStream + + def addBytes(schema: S, location: URI, bytes: InputStream): S + +} + +/** + * Exception returned if a repo cannot find the file + * + * @param uri : location of the file + */ +class ModelNotFoundException(uri: URI) extends FileNotFoundException(s"model located at $uri could not be found") + +private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf) + (implicit val jsonFormat: JsonFormat[S]) + extends Repository[S] { + + private val rootPath = new Path(uri) + + private val fs = FileSystem.get(uri, hconf) + + if (!fs.exists(rootPath)) { + fs.mkdirs(rootPath) + } + + override def listSchemas(): Iterable[S] = { + val fileIteratorHadoop = fs.listFiles(rootPath, false) + val fileIterator = new Iterator[LocatedFileStatus] { + def hasNext: Boolean = fileIteratorHadoop.hasNext + + def next(): LocatedFileStatus = fileIteratorHadoop.next() + } + + val schemaStrings = fileIterator + .filter(status => + status.isFile & status.getPath.toString.endsWith(".meta")) + .map(status => + IOUtils.toString(fs.open(status.getPath).getWrappedStream)) + + schemaStrings.map(s => s.parseJson.convertTo[S]).toList + } + + override def getBytes(schema: S): InputStream = { + try { + fs.open(new Path(schema.uri)) + } catch { + case _: IOException => throw new ModelNotFoundException(schema.uri) + } + } + + override def addBytes(schema: S, location: URI, bytes: 
InputStream): S = { + val path = new Path(location) + val os = fs.create(path) + try { + HUtils.copyBytes(bytes, os, hconf) + } finally { + os.close() + } + val downloadedIs = fs.open(path) + try{ + schema.assertMatchingHash(downloadedIs) + }finally{ + downloadedIs.close() + } + + val newSchema = schema.updateURI(location) + val schemaPath = new Path(location.getPath + ".meta") + val osSchema = fs.create(schemaPath) + val SchemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint) + try { + HUtils.copyBytes(SchemaIs, osSchema, hconf) + } finally { + osSchema.close() + SchemaIs.close() + } + newSchema + } + +} + +/** + * Class to represent repository of models that will eventually be hosted outside the repo + */ +private[spark] class DefaultModelRepo(val baseURL: URL) extends Repository[ModelSchema] { + var connectTimeout = 15000 + var readTimeout = 5000 + + import SchemaJsonProtocol._ + + private def toStream(url: URL) = { + val urlCon = url.openConnection() + urlCon.setConnectTimeout(connectTimeout) + urlCon.setReadTimeout(readTimeout) + new BufferedInputStream(urlCon.getInputStream) + } + + private def join(root: URL, file: String) = { + new Path(new Path(root.toURI), file).toUri.toURL + } + + override def listSchemas(): Iterable[ModelSchema] = { + val url = join(baseURL, "MANIFEST") + val manifestStream = toStream(url) + try { + val modelStreams = IOUtils.readLines(manifestStream).map(fn => toStream(join(baseURL, fn))) + try { + modelStreams.map(s => IOUtils.toString(s).parseJson.convertTo[ModelSchema]) + } finally { + modelStreams.foreach(_.close()) + } + } finally { + manifestStream.close() + } + } + + override def getBytes(schema: ModelSchema): InputStream = { + try { + val url = schema.uri.toURL + val urlCon = url.openConnection() + urlCon.setConnectTimeout(connectTimeout) + urlCon.setReadTimeout(readTimeout) + new BufferedInputStream(urlCon.getInputStream) + } catch { + case _: IOException => throw new ModelNotFoundException(schema.uri) + } + } + + override def addBytes(schema: ModelSchema, location: URI, bytes: InputStream): ModelSchema = + throw new IllegalAccessError("Do not have the credentials to write a file to the remote repository") +} + +private[spark] abstract class Client { + var quiet = false + + private def log(s: String): Unit = { + LogManager.getRootLogger.info(s) + } + + def repoTransfer[T <: Schema](schema: T, targetLocation: URI, + source: Repository[T], target: Repository[T], + overwrite: Boolean = false, closeStream: Boolean = true): T = { + if (target.listSchemas().exists(s => + (s.uri == targetLocation) && (s.hash == schema.hash))) { + log(s"Using model at $targetLocation, skipping download") + target.listSchemas().find(_.hash == schema.hash).get + } else { + log(s"No model found in local repo, writing bytes to $targetLocation") + val sourceStream = source.getBytes(schema) + try { + target.addBytes(schema, targetLocation, sourceStream) + } finally { + if (closeStream) sourceStream.close() + } + } + } + +} + +private[spark] object ModelDownloader { + private[spark] val defaultURL = new URL("https://mmlspark.azureedge.net/datasets/CNTKModels/") +} + +/** + * Class for downloading models from a server to Local or HDFS + * + * @param spark Spark session so that the downloader can save to HDFS + * @param localPath path to a directory that will store the models (local or HDFS) + * @param serverURL URL of the server which supplies models ( The default URL is subject to change) + */ +class ModelDownloader(val spark: SparkSession, + val localPath: URI, + val 
serverURL: URL = ModelDownloader.defaultURL) extends Client { + + import SchemaJsonProtocol._ + + def this(spark: SparkSession, localPath: String, serverURL: String) = { + this(spark, new URI(localPath), new URL(serverURL)) + } + + private val localModelRepo = new HDFSRepo[ModelSchema](localPath, spark.sparkContext.hadoopConfiguration) + + private val remoteModelRepo = new DefaultModelRepo(serverURL) + + /** + * Function for querying the local repository for its registered models + * + * @return the model schemas found in the downloader's local path + */ + def localModels: util.Iterator[ModelSchema] = localModelRepo.listSchemas().iterator.asJava + + /** + * Function for querying the remote server for its registered models + * + * @return the model schemas found in remote reposiory accessed through the serverURL + */ + def remoteModels: util.Iterator[ModelSchema] = remoteModelRepo.listSchemas().iterator.asJava + + /** + * Method to download a single model + * @param model the remote model schema + * @return the new local model schema with a URI that points to the model's location (on HDFS or local) + */ + def downloadModel(model: ModelSchema): ModelSchema = { + repoTransfer(model, + new Path(new Path(localPath), NamingConventions.canonicalModelFilename(model)).toUri, + remoteModelRepo, localModelRepo) + } + + def downloadByName(name: String): ModelSchema = { + val models = remoteModels.filter(_.name == name).toList + if (models.length != 1) { + throw new IllegalArgumentException(s"there are ${models.length} models with the same name") + } + downloadModel(models.head) + } + + /** + * @param models An iterable of remote model schemas + * @return An list of local model schema whose URI's points to the model's location (on HDFS or local) + */ + def downloadModels(models: Iterable[ModelSchema] = remoteModels.toIterable): List[ModelSchema] = + // Call toList so that all models are downloaded when downloadModels are called + models.map(downloadModel).toList + + /** + * @param models A java iterator of remote model schemas for in the java api (for python wrapper) + * @return A java List of local model schema whose URI's points to the model's location (on HDFS or local) + */ + def downloadModels(models: util.ArrayList[ModelSchema]): util.List[ModelSchema] = + // Call toList so that all models are downloaded when downloadModels are called + models.map(downloadModel).toList.asJava + +} diff --git a/src/downloader/src/main/scala/Schema.scala b/src/downloader/src/main/scala/Schema.scala new file mode 100644 index 0000000000..f30d02a83e --- /dev/null +++ b/src/downloader/src/main/scala/Schema.scala @@ -0,0 +1,92 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.InputStream +import java.net.URI +import org.apache.commons.codec.digest.DigestUtils +import spray.json._ + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +private[spark] object NamingConventions { + + def canonicalModelFilename(name: String, dataset: String): String = + s"${name}_$dataset.model" + + def canonicalModelFilename(model: ModelSchema): String = + s"${model.name}_${model.dataset}.model" + +} + +/** + * Abstract representation of a schema for an item that can be held in a repository + * + * @param uri location of the underlying file (local, HDFS, or HTTP) + * @param hash sha256 hash of the underlying file + * @param size size in bytes of the underlying file + */ +abstract class Schema(val uri: URI, val hash: String, val size: Long) { + + def updateURI(newURI: URI): this.type + + def assertMatchingHash(bytes: InputStream): Unit = { + val newHash = DigestUtils.sha256Hex(bytes) + if (newHash != hash) { + throw new IllegalArgumentException(s"downloaded hash: $newHash does not match given hash: $hash") + } + } + +} + +/** + * Class representing the schema of a CNTK model + * @param name name of the model architecture + * @param dataset dataset the model was trained on + * @param modelType type of problem the model is suited for eg: (image, text, sound, sentiment etc) + * @param uri location of the underlying file (local, HDFS, or HTTP) + * @param hash sha256 hash of the underlying file + * @param size size in bytes of the underlying file + * @param inputNode the node which represents the input + * @param numLayers the number of layers of the model + * @param layerNames the names nodes that represent layers in the network + */ +case class ModelSchema(name: String, + dataset: String, + modelType: String, + override val uri: URI, + override val hash: String, + override val size: Long, + inputNode: Int, + numLayers: Int, + layerNames: Array[String]) + extends Schema(uri, hash, size) { + + def this(name: String, dataset: String, modelType: String, + uri: URI, hash: String, size: Long, inputNode: Int, numLayers: Int, + layerNames: java.util.ArrayList[String]) = { + this(name, dataset, modelType, uri, hash, size, + inputNode, numLayers, layerNames.toList.toArray) + } + + override def updateURI(newURI: URI): this.type = this.copy(uri = newURI).asInstanceOf[this.type] + +} + +private[spark] object SchemaJsonProtocol extends DefaultJsonProtocol { + + implicit object URIJsonFormat extends JsonFormat[URI] { + def write(u: URI): JsValue = { + JsString(u.toString) + } + + def read(value: JsValue): URI = new URI(value.asInstanceOf[JsString].value) + } + + implicit val modelSchemaFormat: RootJsonFormat[ModelSchema] = + jsonFormat(ModelSchema.apply, + "name", "dataset", "modelType", "uri", "hash", "size", "inputNode", "numLayers", "layerNames") + +} diff --git a/src/downloader/src/test/scala/DownloaderSuite.scala b/src/downloader/src/test/scala/DownloaderSuite.scala new file mode 100644 index 0000000000..835c39e716 --- /dev/null +++ b/src/downloader/src/test/scala/DownloaderSuite.scala @@ -0,0 +1,49 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.nio.file.Files +import com.microsoft.ml.spark.FileUtilities.File +import scala.collection.JavaConversions._ + +class DownloaderSuite extends TestBase { + + val saveDir = Files.createTempDirectory("Models-").toFile + val d = new ModelDownloader(session, saveDir.toURI) + + test("A downloader should be able to download a model", TestBase.Extended) { + val m = d.remoteModels.filter(_.name == "CNN").next() + val schema = d.downloadModel(m) + println(schema) + assert(m.size == new File(schema.uri).length()) + assert(d.localModels.toList.length == 1) + } + + test("A downloader should be able to get all Models " + + "and maybeDownload should be fast if models are downloaded", TestBase.Extended) { + + d.downloadModels() + val modTimes = d.localModels.map(s => + new File(s.uri).lastModified()) + + d.downloadModels() + val modTimes2 = d.localModels.map(s => + new File(s.uri).lastModified()) + + // No modification on second call because models are cached + assert(modTimes.toList === modTimes2.toList) + + // the downloader's local models will reflect the change + assert(d.localModels.toList.length == d.remoteModels.toList.length) + + // there will be a metadata file for every model + assert(saveDir.list().count(_.endsWith(".meta")) == d.localModels.toList.length) + } + + override def afterAll(): Unit = { + FileUtilities.delTree(saveDir) + super.afterAll() + } + +} diff --git a/src/featurize/build.sbt b/src/featurize/build.sbt new file mode 100644 index 0000000000..c013e90fa1 --- /dev/null +++ b/src/featurize/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: utils +//> DependsOn: multi-column-adapter diff --git a/src/featurize/src/main/scala/AssembleFeatures.scala b/src/featurize/src/main/scala/AssembleFeatures.scala new file mode 100644 index 0000000000..27b583d00e --- /dev/null +++ b/src/featurize/src/main/scala/AssembleFeatures.scala @@ -0,0 +1,499 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io._ + +import com.microsoft.ml.spark.schema.{CategoricalColumnInfo, DatasetExtensions} +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.linalg.SparseVector +import org.apache.spark.mllib.linalg.VectorUDT +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StringType, _} + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer +import scala.collection.immutable.{BitSet, HashSet} + +private object AssembleFeaturesUtilities +{ + private val tokenizedColumnName = "tokenizedFeatures" + private val hashedFeaturesColumnName = "hashedFeatures" + private val selectedFeaturesColumnName = "selectedFeatures" + + def getTokenizedColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(tokenizedColumnName) + } + + def getHashedFeaturesColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(hashedFeaturesColumnName) + } + + def getSelectedFeaturesColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(selectedFeaturesColumnName) + } + + def hashStringColumns(nonMissingDataset: DataFrame, colNamesToHash: ListBuffer[String], + hashingTransform: HashingTF): DataFrame = { + val tokenizeFunc = udf((cols: Seq[String]) => cols + .filter(str => str != null && !str.isEmpty) + .flatMap(str => str.toLowerCase.split("\\s"))) + val cols = array(colNamesToHash.map(x => col(x)): _*) + val combinedData = nonMissingDataset.withColumn(hashingTransform.getInputCol, tokenizeFunc(cols)) + hashingTransform.transform(combinedData) + } + + def isNumeric(dataType: DataType): Boolean = dataType == IntegerType || + dataType == BooleanType || + dataType == LongType || + dataType == ByteType || + dataType == ShortType || + dataType == FloatType +} + +/** + * Class containing the list of column names to perform special featurization steps for. + * colNamesToHash - List of column names to hash. + * colNamesToDuplicateForMissings - List of column names containing doubles to duplicate + * so we can remove missing values from them. + * colNamesToTypes - Map of column names to their types. + * colNamesToCleanMissings - List of column names to clean missing values from (ignore). + * colNamesToVectorize - List of column names to vectorize using FastVectorAssembler. + * categoricalColumns - List of categorical columns to pass through or turn into indicator array. + * conversionColumnNamesMap - Map from old column names to new. + */ +@SerialVersionUID(0L) +class ColumnNamesToFeaturize extends Serializable { + val colNamesToHash = ListBuffer[String]() + val colNamesToDuplicateForMissings = ListBuffer[String]() + val colNamesToTypes = mutable.Map[String, DataType]() + val vectorColumnsToAdd = ListBuffer[String]() + val colNamesToCleanMissings = ListBuffer[String]() + val colNamesToVectorize = ListBuffer[String]() + val categoricalColumns = mutable.Map[String, String]() + val conversionColumnNamesMap = mutable.Map[String, String]() +} + +object AssembleFeatures extends DefaultParamsReadable[AssembleFeatures] + +/** + * Assembles the features in a dataset, converting them to a form appropriate for training. 
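+ *
+ * A minimal usage sketch (the DataFrame `df` and its column names are hypothetical):
+ * {{{
+ *   val model = new AssembleFeatures()
+ *     .setColumnsToFeaturize(Array("col1", "col2"))
+ *     .fit(df)
+ *   val assembled = model.transform(df)  // adds the assembled "features" vector column
+ * }}}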
+ */ +class AssembleFeatures(override val uid: String) extends Estimator[AssembleFeaturesModel] + with HasFeaturesCol with MMLParams { + + def this() = this(Identifiable.randomUID("AssembleFeatures")) + + setDefault(featuresCol -> "features") + + val columnsToFeaturize: StringArrayParam = + new StringArrayParam(this, "columnsToFeaturize", "columns to featurize", array => true) + + /** @group getParam **/ + final def getColumnsToFeaturize: Array[String] = $(columnsToFeaturize) + + /** @group setParam **/ + def setColumnsToFeaturize(value: Array[String]): this.type = set(columnsToFeaturize, value) + + val oneHotEncodeCategoricals: Param[Boolean] = BooleanParam(this, + "oneHotEncodeCategoricals", + "one hot encode categoricals", + true) + + /** @group getParam **/ + final def getOneHotEncodeCategoricals: Boolean = $(oneHotEncodeCategoricals) + + /** @group setParam **/ + def setOneHotEncodeCategoricals(value: Boolean): this.type = set(oneHotEncodeCategoricals, value) + + val numberOfFeatures: IntParam = + IntParam(this, "numberOfFeatures", "number of features to hash string columns to") + + /** @group getParam **/ + final def getNumberOfFeatures: Int = $(numberOfFeatures) + + /** @group setParam **/ + def setNumberOfFeatures(value: Int): this.type = set(numberOfFeatures, value) + + /** + * Assembles the features in the dataset. + * + * @param dataset The input dataset to fit. + * @return The model that will return the original dataset with assembled features as a vector. + */ + override def fit(dataset: Dataset[_]): AssembleFeaturesModel = { + val columnNamesToFeaturize = new ColumnNamesToFeaturize + + val columnsToFeaturize = HashSet[String](getColumnsToFeaturize: _*) + + val columns = dataset.columns + + val allIntermediateCols = new mutable.HashSet[String]() + allIntermediateCols ++= columns + + val datasetAsDf = dataset.toDF() + + // Remap and convert columns prior to training + columns.foreach { + col => if (columnsToFeaturize.contains(col)) { + val unusedColumnName = DatasetExtensions.findUnusedColumnName(col)(allIntermediateCols) + allIntermediateCols += unusedColumnName + + // Find out if column is categorical + // If using non-tree learner, one-hot encode them + // Otherwise, pass attributes directly to train classifier, + // but move categoricals to beginning for superior + // runtime and to avoid spark bug + val categoricalInfo = new CategoricalColumnInfo(datasetAsDf, col) + val isCategorical = categoricalInfo.isCategorical + if (isCategorical) { + val oheColumnName = DatasetExtensions.findUnusedColumnName("TmpOHE_" + unusedColumnName)(allIntermediateCols) + columnNamesToFeaturize.categoricalColumns += unusedColumnName -> oheColumnName + } + + dataset.schema(col).dataType match { + case _ @ (dataType: DataType) if dataType == DoubleType + || dataType == FloatType => { + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + // For double and float columns, will always need to remove possibly NaN values + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + columnNamesToFeaturize.conversionColumnNamesMap += col -> unusedColumnName + } + case _ @ (dataType: DataType) if (AssembleFeaturesUtilities.isNumeric(dataType)) => { + // Convert all numeric columns to same type double to feed them as a vector to the learner + if (dataset.schema(col).nullable) { + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + } + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + columnNamesToFeaturize.conversionColumnNamesMap += col 
-> unusedColumnName + } + case _: StringType => { + // Hash string columns + columnNamesToFeaturize.colNamesToHash += col + columnNamesToFeaturize.colNamesToTypes += col -> StringType + } + case _ @ (dataType: DataType) if dataType.typeName == "vector" || dataType.isInstanceOf[VectorUDT] => { + columnNamesToFeaturize.vectorColumnsToAdd += unusedColumnName + // For double columns, will always need to remove possibly NaN values + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + columnNamesToFeaturize.conversionColumnNamesMap += col -> unusedColumnName + } + } + } + } + val colNamesToVectorizeWithoutHashOneHot: List[String] = getColumnsToVectorize(columnNamesToFeaturize, + columnNamesToFeaturize.conversionColumnNamesMap.keys.toSeq) + + // Tokenize the string columns + val (transform: Option[HashingTF], colNamesToVectorize: List[String], nonZeroColumns: Option[Array[Int]]) = + if (columnNamesToFeaturize.colNamesToHash.isEmpty) + (None, colNamesToVectorizeWithoutHashOneHot, None) + else { + val hashingTransform = new HashingTF() + .setInputCol(AssembleFeaturesUtilities.getTokenizedColumnName(datasetAsDf)) + .setOutputCol(AssembleFeaturesUtilities.getHashedFeaturesColumnName(datasetAsDf)) + .setNumFeatures(getNumberOfFeatures) + + // Hash data for the vectorizer, to determine which slots are non-zero and should be kept + val hashedData = AssembleFeaturesUtilities.hashStringColumns(datasetAsDf, + columnNamesToFeaturize.colNamesToHash, + hashingTransform) + val encoder = Encoders.kryo[BitSet] + val bitset = hashedData.select(hashingTransform.getOutputCol) + .map(row => toBitSet(row.getAs[SparseVector](0).indices))(encoder) + .reduce(_ | _) + + val nonZeroColumns: Array[Int] = bitset.toArray + + val colsToVectorize = + colNamesToVectorizeWithoutHashOneHot :+ AssembleFeaturesUtilities.getSelectedFeaturesColumnName(datasetAsDf) + + (Some(hashingTransform), + colsToVectorize, + Some(nonZeroColumns)) + } + + columnNamesToFeaturize.colNamesToVectorize ++= colNamesToVectorize + + val vectorAssembler = new FastVectorAssembler() + .setInputCols(colNamesToVectorize.toArray) + .setOutputCol(getFeaturesCol) + + new AssembleFeaturesModel(uid, columnNamesToFeaturize, transform, + nonZeroColumns, vectorAssembler, $(oneHotEncodeCategoricals)) + } + + private def getColumnsToVectorize(columnNamesToFeaturize: ColumnNamesToFeaturize, + columnsToFeaturize: Seq[String]): List[String] = { + val categoricalColumnNames = + if ($(oneHotEncodeCategoricals)) { + columnNamesToFeaturize.categoricalColumns.values + } else { + columnNamesToFeaturize.categoricalColumns.keys + } + + val newColumnNames = + columnsToFeaturize.map(oldColName => columnNamesToFeaturize.conversionColumnNamesMap(oldColName)) + + val colNamesToVectorizeWithoutHash = (categoricalColumnNames.toList + ::: newColumnNames.toList) + .distinct + + // If one hot encoding, remove the columns we are converting from the list to vectorize + val colNamesToVectorizeWithoutHashOneHot = + if ($(oneHotEncodeCategoricals)) { + colNamesToVectorizeWithoutHash.filter { + !columnNamesToFeaturize.categoricalColumns.contains(_) + } + } else { + colNamesToVectorizeWithoutHash + } + colNamesToVectorizeWithoutHashOneHot + } + + def toBitSet(indices: Array[Int]): BitSet = { + indices.foldLeft(BitSet())((bitset, index) => bitset + index) + } + + override def copy(extra: ParamMap): Estimator[AssembleFeaturesModel] = { + new AssembleFeatures() + } + + @DeveloperApi + override def 
transformSchema(schema: StructType): StructType = + schema.add(new StructField(getFeaturesCol, VectorType)) + +} + +/** + * Model produced by [[AssembleFeatures]]. + */ +class AssembleFeaturesModel(val uid: String, + val columnNamesToFeaturize: ColumnNamesToFeaturize, + val hashingTransform: Option[HashingTF], + val nonZeroColumns: Option[Array[Int]], + val vectorAssembler: FastVectorAssembler, + val oneHotEncodeCategoricals: Boolean) + extends Model[AssembleFeaturesModel] with Params with MLWritable { + + /** @group getParam **/ + final def getFeaturesColumn: String = vectorAssembler.getOutputCol + + override def write: MLWriter = new AssembleFeaturesModel.AssembleFeaturesModelWriter(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + + override def copy(extra: ParamMap): AssembleFeaturesModel = + new AssembleFeaturesModel(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + + override def transform(dataset: Dataset[_]): DataFrame = { + val transformedDataset = dataset.select( + dataset.columns.flatMap { + col => { + val dataType = dataset.schema(col).dataType + if (!dataType.isInstanceOf[StringType] + && columnNamesToFeaturize.colNamesToHash.contains(col)) { + throw new Exception("Invalid column type specified during score, should be string for column: " + col) + } + + if (!columnNamesToFeaturize.conversionColumnNamesMap.contains(col)) { + Seq(dataset(col)) + } else { + val colType = columnNamesToFeaturize.colNamesToTypes(columnNamesToFeaturize.conversionColumnNamesMap(col)) + if (colType != dataType) { + throw new Exception(s"Invalid column type specified during score, should be $colType for column: " + col) + } + + // Convert all columns to same type double to feed them as a vector to the learner + dataType match { + case _ @ (dataType: DataType) if (AssembleFeaturesUtilities.isNumeric(dataType)) => { + Seq(dataset(col), + dataset(col).cast(DoubleType).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case _: DoubleType => { + Seq(dataset(col), + dataset(col).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case _ @ (dataType: DataType) if dataType.typeName == "vector" || dataType.isInstanceOf[VectorUDT] => { + Seq(dataset(col), + dataset(col).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case default => Seq(dataset(col)) + } + } + } + }: _* + ) + + // Drop all rows with missing values + val nonMissingDataset = transformedDataset.na.drop(columnNamesToFeaturize.colNamesToCleanMissings) + // Tokenize the string columns + val stringFeaturizedData: DataFrame = + if (columnNamesToFeaturize.colNamesToHash.isEmpty) nonMissingDataset + else { + val hashedData = AssembleFeaturesUtilities.hashStringColumns(nonMissingDataset, + columnNamesToFeaturize.colNamesToHash, + hashingTransform.get) + + val vectorSlicer = new VectorSlicer().setIndices(nonZeroColumns.get) + .setInputCol(hashingTransform.get.getOutputCol) + .setOutputCol(columnNamesToFeaturize.colNamesToVectorize.last) + // Run count based feature selection on the hashed data + val countBasedFeatureSelectedColumns = vectorSlicer.transform(hashedData) + // Remove the intermediate columns tokenized and hashed + countBasedFeatureSelectedColumns + .drop(hashingTransform.get.getInputCol) + .drop(hashingTransform.get.getOutputCol) + } + var columnsToDrop = 
vectorAssembler.getInputCols + // One-hot encode categoricals + val oheData = + if (oneHotEncodeCategoricals && !columnNamesToFeaturize.categoricalColumns.isEmpty) { + val ohe = new OneHotEncoder() + val inputColsKeys = columnNamesToFeaturize.categoricalColumns.keys + val outputColsKeys = columnNamesToFeaturize.categoricalColumns.values + val inputCols = inputColsKeys.mkString(",") + val outputCols = outputColsKeys.mkString(",") + val oheAdapter = + new MultiColumnAdapter().setBaseTransformer(ohe).setInputCols(inputCols).setOutputCols(outputCols) + columnsToDrop = columnsToDrop.union(columnNamesToFeaturize.categoricalColumns.keys.toSeq) + oheAdapter.transform(stringFeaturizedData) + } else { + stringFeaturizedData + } + + val vectorizedData = vectorAssembler.transform(oheData) + + // Drop the vector assembler intermediate columns + vectorizedData.drop(columnsToDrop: _*) + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + schema.add(new StructField(getFeaturesColumn, VectorType)) + +} + +object AssembleFeaturesModel extends MLReadable[AssembleFeaturesModel] { + + private val hashingTransformPart = "hashingTransform" + private val vectorAssemblerPart = "vectorAssembler" + private val columnNamesToFeaturizePart = "columnNamesToFeaturize" + private val nonZeroColumnsPart = "nonZeroColumns" + private val dataPart = "data" + + override def read: MLReader[AssembleFeaturesModel] = new AssembleFeaturesModelReader + + override def load(path: String): AssembleFeaturesModel = super.load(path) + + /** [[MLWriter]] instance for [[AssembleFeaturesModel]] */ + private[AssembleFeaturesModel] + class AssembleFeaturesModelWriter(val uid: String, + val columnNamesToFeaturize: ColumnNamesToFeaturize, + val hashingTransform: Option[HashingTF], + val nonZeroColumns: Option[Array[Int]], + val vectorAssembler: FastVectorAssembler, + val oneHotEncodeCategoricals: Boolean) + extends MLWriter { + private case class Data(uid: String, oneHotEncodeCategoricals: Boolean) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + AssembleFeaturesModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + val dataPath = new Path(qualPath, dataPart).toString + + // Save data + val data = Data(uid, oneHotEncodeCategoricals) + // save the hashing transform + if (!hashingTransform.isEmpty) { + val hashingTransformPath = new Path(qualPath, hashingTransformPart).toString + val writer = + if (overwrite) hashingTransform.get.write.overwrite() + else hashingTransform.get.write + writer.save(hashingTransformPath) + } + // save the vector assembler + val vectorAssemblerPath = new Path(qualPath, vectorAssemblerPart).toString + val writer = + if (overwrite) vectorAssembler.write.overwrite() + else vectorAssembler.write + writer.save(vectorAssemblerPath) + + // save the column names to featurize + ObjectUtilities.writeObject(columnNamesToFeaturize, qualPath, columnNamesToFeaturizePart, sc, overwrite) + + // save the nonzero columns + ObjectUtilities.writeObject(nonZeroColumns, qualPath, nonZeroColumnsPart, sc, overwrite) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class AssembleFeaturesModelReader + 
extends MLReader[AssembleFeaturesModel] { + override def load(path: String): AssembleFeaturesModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid and one hot encoding param + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, oneHotEncodeCategoricals: Boolean) = + data.select("uid", "oneHotEncodeCategoricals").head() + + // load the hashing transform + val hashingPath = new Path(qualPath, hashingTransformPart).toString + val hashingTransform = + if (new File(hashingPath).exists()) Some(HashingTF.load(hashingPath)) + else None + + // load the vector assembler + val vectorAssemblerPath = new Path(qualPath, vectorAssemblerPart).toString + val vectorAssembler = FastVectorAssembler.load(vectorAssemblerPath) + + // load the column names to featurize + val columnNamesToFeaturize = + ObjectUtilities.loadObject[ColumnNamesToFeaturize](qualPath, columnNamesToFeaturizePart, sc) + + // load the nonzero columns + val nonZeroColumns = ObjectUtilities.loadObject[Option[Array[Int]]](qualPath, nonZeroColumnsPart, sc) + + new AssembleFeaturesModel(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + } + } + +} diff --git a/src/featurize/src/main/scala/Featurize.scala b/src/featurize/src/main/scala/Featurize.scala new file mode 100644 index 0000000000..2090891f79 --- /dev/null +++ b/src/featurize/src/main/scala/Featurize.scala @@ -0,0 +1,92 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel} +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +private object FeaturizeUtilities +{ + // 2^18 features by default + val numFeaturesDefault = 262144 + // 2^12 features for tree-based or NN-based learners + val numFeaturesTreeOrNNBased = 4096 +} + +object Featurize extends DefaultParamsReadable[Featurize] + +/** + * Featurizes a dataset, converting them to a form appropriate for training. 
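+ *
+ * A minimal usage sketch (the DataFrame `df` and its column names are hypothetical). Feature
+ * columns are given as a map from each output feature column to the input columns that
+ * should be featurized into it:
+ * {{{
+ *   val model = new Featurize()
+ *     .setFeatureColumns(Map("features" -> Seq("col1", "col2")))
+ *     .setNumberOfFeatures(4096)
+ *     .fit(df)
+ *   val featurized = model.transform(df)
+ * }}}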
+ */ +class Featurize(override val uid: String) extends Estimator[PipelineModel] with MMLParams { + + def this() = this(Identifiable.randomUID("Featurize")) + + val featureColumns: MapArrayParam = new MapArrayParam(uid, "featureColumns", "feature columns") + + /** @group getParam **/ + final def getFeatureColumns: Map[String, Seq[String]] = $(featureColumns) + + /** @group setParam **/ + def setFeatureColumns(value: Map[String, Seq[String]]): this.type = set(featureColumns, value) + + val oneHotEncodeCategoricals: Param[Boolean] = BooleanParam(this, + "oneHotEncodeCategoricals", + "one hot encode categoricals", + true) + + /** @group getParam **/ + final def getOneHotEncodeCategoricals: Boolean = $(oneHotEncodeCategoricals) + + /** @group setParam **/ + def setOneHotEncodeCategoricals(value: Boolean): this.type = set(oneHotEncodeCategoricals, value) + + val numberOfFeatures: IntParam = + IntParam(this, + "numberOfFeatures", + "number of features to hash string columns to", + FeaturizeUtilities.numFeaturesDefault) + + /** @group getParam **/ + final def getNumberOfFeatures: Int = $(numberOfFeatures) + + /** @group setParam **/ + def setNumberOfFeatures(value: Int): this.type = set(numberOfFeatures, value) + + /** + * Featurizes the dataset. + * + * @param dataset The input dataset to train. + * @return The featurized model. + */ + override def fit(dataset: Dataset[_]): PipelineModel = { + val pipeline = assembleFeaturesEstimators(getFeatureColumns) + pipeline.fit(dataset) + } + + private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = { + val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => { + new AssembleFeatures() + .setColumnsToFeaturize(newColToFeatures._2.toArray) + .setFeaturesCol(newColToFeatures._1) + .setNumberOfFeatures(getNumberOfFeatures) + .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals) + }).toArray + + new Pipeline().setStages(assembleFeaturesEstimators) + } + + override def copy(extra: ParamMap): Estimator[PipelineModel] = { + new Featurize() + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema) + +} diff --git a/src/featurize/src/test/scala/VerifyFeaturize.scala b/src/featurize/src/test/scala/VerifyFeaturize.scala new file mode 100644 index 0000000000..d01294856f --- /dev/null +++ b/src/featurize/src/test/scala/VerifyFeaturize.scala @@ -0,0 +1,330 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.nio.file.Files + +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.schema.SparkSchema +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.feature.StringIndexer +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors} +import org.apache.spark.sql._ +import org.apache.spark.sql.types.StructType + +class VerifyAssembleFeatures extends EstimatorFuzzingTest { + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val assembleFeatures = estimator.asInstanceOf[AssembleFeatures] + assembleFeatures.setColumnsToFeaturize(fitDataset.columns) + } + + override def createFitDataset: DataFrame = { + // TODO: Fix bug for date and timestamp type not supported + val options = DatasetOptions(ColumnOptions.ValueSet(ColumnOptions.Scalar), + DataOptions.ValueSet(DataOptions.String, + DataOptions.Int, + DataOptions.Double, + DataOptions.Boolean, + DataOptions.Byte, + DataOptions.Short)) + val indexToType = Map[Int, DatasetOptions](1 -> options, 2 -> options, 3 -> options, 4 -> options, 5 -> options) + GenerateDataset.generateDatasetFromOptions(session, + indexToType, + new BasicDatasetGenerationConstraints(5, 5, Array()), + 0) + } + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new AssembleFeatures() +} + +class VerifyFeaturize extends EstimatorFuzzingTest { + + val mockLabelColumn = "Label" + val featuresColumn = "testColumn" + val thisDirectory = new File("src/test/scala") + val targetDirectory = new File("target") + + val benchmarkBasicDataTypesFile = "benchmarkBasicDataTypes.json" + val historicDataTypesFile = new File(thisDirectory, benchmarkBasicDataTypesFile) + val benchmarkBasicDataTypesTempFile = getTempFile(benchmarkBasicDataTypesFile) + + val benchmarkVectorsFile = "benchmarkVectors.json" + val historicVectorsFile = new File(thisDirectory, benchmarkVectorsFile) + val benchmarkVectorsTempFile = getTempFile(benchmarkVectorsFile) + + val benchmarkStringFile = "benchmarkString.json" + val historicStringFile = new File(thisDirectory, benchmarkStringFile) + val benchmarkStringTempFile = getTempFile(benchmarkStringFile) + + val benchmarkStringMissingsFile = "benchmarkStringMissing.json" + val historicStringMissingsFile = new File(thisDirectory, benchmarkStringMissingsFile) + val benchmarkStringMissingsTempFile = getTempFile(benchmarkStringMissingsFile) + + val benchmarkOneHotFile = "benchmarkOneHot.json" + val historicOneHotFile = new File(thisDirectory, benchmarkOneHotFile) + val benchmarkOneHotTempFile = getTempFile(benchmarkOneHotFile) + + val benchmarkNoOneHotFile = "benchmarkNoOneHot.json" + val historicNoOneHotFile = new File(thisDirectory, benchmarkNoOneHotFile) + val benchmarkNoOneHotTempFile = getTempFile(benchmarkNoOneHotFile) + + val benchmarkOneHotMissingsFile = "benchmarkOneHotMissings.json" + val historicOneHotMissingsFile = new File(thisDirectory, benchmarkOneHotMissingsFile) + val benchmarkOneHotMissingsTempFile = getTempFile(benchmarkOneHotMissingsFile) + + val benchmarkNoOneHotMissingsFile = "benchmarkNoOneHotMissings.json" + val historicNoOneHotMissingsFile = new File(thisDirectory, benchmarkNoOneHotMissingsFile) + val benchmarkNoOneHotMissingsTempFile = getTempFile(benchmarkNoOneHotMissingsFile) + + val benchmarkStringIndexOneHotFile = "benchmarkStringIndexOneHot.json" + val historicStringIndexOneHotFile = new File(thisDirectory, benchmarkStringIndexOneHotFile) + val 
benchmarkStringIndexOneHotTempFile = getTempFile(benchmarkStringIndexOneHotFile) + + private def getTempFile(fileName: String): File = { + new File(targetDirectory, + s"${fileName}_${System.currentTimeMillis}_.json") + } + + // int label with features of: + // long, double, boolean, int, byte, float + val mockDataset = session.createDataFrame(Seq( + (0, 2L, 0.50, true, 0, 0.toByte, 12F), + (1, 3L, 0.40, false, 1, 100.toByte, 30F), + (0, 4L, 0.78, true, 2, 50.toByte, 12F), + (1, 5L, 0.12, false, 3, 0.toByte, 12F), + (0, 1L, 0.50, true, 0, 0.toByte, 30F), + (1, 3L, 0.40, false, 1, 10.toByte, 12F), + (0, 3L, 0.78, false, 2, 0.toByte, 12F), + (1, 4L, 0.12, false, 3, 0.toByte, 12F), + (0, 0L, 0.50, true, 0, 0.toByte, 12F), + (1, 2L, 0.40, false, 1, 127.toByte, 30F), + (0, 3L, 0.78, true, 2, -128.toByte, 12F), + (1, 4L, 0.12, false, 3, 0.toByte, 12F))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5", "col6") + + test("Featurizing on some basic data types") { + val result: DataFrame = featurizeAndVerifyResult(mockDataset, + benchmarkBasicDataTypesTempFile.toString, + historicDataTypesFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).values.length == 6) + } + + test("Featurizing with vector columns, sparse and dense") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), 0.50, 0.60, 0, Vectors.dense(1.0, 0.1, -1.5)), + (1, Vectors.dense(1.5, 0.2, -1.2), 0.40, 0.50, 1, Vectors.dense(1.5, 0.2, -1.2)), + (1, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), 0.12, 0.34, 3, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0)))), + (0, Vectors.dense(1.1, 0.5, -1.024), 0.50, 0.60, 0, Vectors.dense(1.0, 0.4, -1.23)), + (1, Vectors.dense(1.1, 0.5, -1.056), 0.40, 0.50, 1, Vectors.dense(1.1, 0.5, -1.024)), + (0, Vectors.dense(Double.NaN, 0.2, -1.23), 0.78, 0.99, 2, Vectors.dense(1.0, 0.1, -1.22)), + (1, Vectors.dense(1.0, 0.4, -1.23), 0.12, 0.34, 3, Vectors.dense(Double.NaN, 0.2, -1.23)))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkVectorsTempFile.toString, + historicVectorsFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).values.length == 9) + } + + test("Featurizing with text columns - using hashing with count based feature selection") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, "pokemon are everywhere"), + (1, 3, 0.40, 0.50, "they are in the woods"), + (0, 4, 0.78, 0.99, "they are in the water"), + (1, 5, 0.12, 0.34, "they are in the fields"), + (0, 3, 0.78, 0.99, "pokemon - gotta catch em all"))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkStringTempFile.toString, + historicStringFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[SparseVector](featuresColumn).size == 11) + } + + test("Verify featurizing text data produces proper tokenized output") { + val wordCountCol = "wordCount" + val wordLengthCol = "wordLength" + val textCol = "textCol" + val mockAmazonData = session.createDataFrame(Seq( + (1, 221, 4.42, "Ok~ but I think the Keirsey Temperment Test is more accurate - and cheaper. This book has its " + + "good points. If anything, it helps you put into words what you want from a supervisor, but it is not very " + + "accurate. 
The online test does not account for a difference between when 2 of their options are both " + + "exactly like you, or if they don't describe you at all. This messes up the results, and it did not " + + "describe me very well. I am not just in denial. I have taken a lot of personality type tests, like " + + "the Keirsey Temperment sorter and have a pretty good idea of my strengths. So, although this book is " + + "pretty good in making you understand the importance of incouraging your strengths, it still " + + "leaves you wondering about how you fit in to their terminology. As for using this book as a manager " + + "to better serve your employees, I'v seen it done and it does not necessarily work because the strengths " + + "spit out for people were not wholly accurate. The company I work for has done this, and most of the " + + "people who were shifted around to better serve their strengths (according to this book) are very " + + "unhappy in their new positions. Your money can be spent better elsewhere. I say its only worth about $10"), + (0, 138, 4.49, "I had a bad feeling about this! And I was right! I was intrigued by the title, which " + + "supposedly links Jedi wisdom to Christianity. Well, after 60 pages or so, I have got the feeling that the " + + "Staub is trying to wrap Jedi in Christian cloth and failing at that. The author speaks of the difficulty in " + + "leading a Christian life. But, I say that any religious life (be it Christian, Islam or otherwise) is hard " + + "because it turns the back on the norm or the conventional. I am convinced that Yoda is a Zen master; " + + "the Force is derived from Tao, not God as interpreted by the orthodox religion(I am purposefully leaving " + + "out Christian Mysticism, which is another beast altogether.). A better book on the subject of theology " + + "in Star wars is \"The Dharma of Star Wars.\""), + (0, 43, 4.98, "Poorly written I tried reading this book but found it so turgid and poorly written that I " + + "put it down in frustration. It reads like a translation from another language by an academic bureacrat. " + + "The theme is interesting, the execution poor. 
Cannot recommend"))) + .toDF(mockLabelColumn, wordCountCol, wordLengthCol, textCol) + + val featModel = new Featurize() + .setFeatureColumns(Map { featuresColumn -> Array(wordCountCol, wordLengthCol, textCol) }) + .setNumberOfFeatures(100000).fit(mockAmazonData) + val nonzeroValuesThreshold = 30 + featModel.transform(mockAmazonData).collect().foreach( + row => assert(row.getAs[SparseVector](featuresColumn).indices.length > nonzeroValuesThreshold, + "Strings improperly tokenized") + ) + } + + test("Featurizing with text columns that have missing values - using hashing with count based feature selection") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, "pokemon are everywhere"), + (1, 3, 0.40, null), + (0, 4, 0.78, "they are in the water"), + (1, 5, 0.12, "they are in the fields"), + (0, 3, 0.78, null))) + .toDF(mockLabelColumn, "col1", "col2", "col3") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkStringMissingsTempFile.toString, + historicStringMissingsFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 8) + } + + test("Featurizing with categorical columns - using one hot encoding") { + val cat = "Cat" + val dog = "Dog" + val bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, dog, cat), + (1, 3, 0.40, 0.50, cat, dog), + (0, 4, 0.78, 0.99, dog, bird), + (1, 5, 0.12, 0.34, cat, dog), + (0, 3, 0.78, 0.99, dog, bird), + (1, 4, 0.12, 0.34, bird, dog))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val catDataset = SparkSchema.makeCategorical( + SparkSchema.makeCategorical(dataset, "col4", "col4", false), + "col5", + "col5", + false) + + val result: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkOneHotTempFile.toString, + historicOneHotFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 7) + + // Verify without one-hot encoding we get expected data + val resultNoOneHot: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkNoOneHotTempFile.toString, + historicNoOneHotFile) + // Verify that features column has the correct number of slots + assert(resultNoOneHot.first().getAs[DenseVector](featuresColumn).size == 5) + + // Verify get equivalent results if we use string indexer for making categoricals + val tmp4col = "col4tmp" + val tmp5col = "col5tmp" + val strind1 = new StringIndexer().setInputCol("col4").setOutputCol(tmp4col) + val strind2 = new StringIndexer().setInputCol("col5").setOutputCol(tmp5col) + val fit1 = strind1.fit(dataset) + val catResult1 = fit1.transform(dataset) + val fit2 = strind2.fit(catResult1) + val catResult2 = fit2.transform(catResult1) + .drop("col4", "col5") + .withColumnRenamed(tmp4col, "col4") + .withColumnRenamed(tmp5col, "col5") + + val resultStringIndexer: DataFrame = featurizeAndVerifyResult(catResult2, + benchmarkStringIndexOneHotTempFile.toString, + historicStringIndexOneHotFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(resultStringIndexer.first().getAs[DenseVector](featuresColumn).size == 7) + } + + // This test currently fails on makeCategorical, where we should handle missing values (unlike spark, + // which fails with a null reference exception) + ignore("Featurizing with categorical columns that have missings - using one hot encoding") { + val cat = "Cat" + val dog = "Dog" + val 
bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, cat), + (1, null), + (0, bird), + (1, null), + (0, bird), + (1, dog))) + .toDF(mockLabelColumn, "col1") + + val catDataset = SparkSchema.makeCategorical(dataset, "col1", "col1", false) + + val result: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkOneHotMissingsTempFile.toString, + historicOneHotMissingsFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 4) + + // Verify without one-hot encoding we get expected data + val resultNoOneHot: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkNoOneHotMissingsTempFile.toString, + historicNoOneHotMissingsFile) + // Verify that features column has the correct number of slots + assert(resultNoOneHot.first().getAs[DenseVector](featuresColumn).size == 4) + } + + def featurizeAndVerifyResult(dataset: DataFrame, + tempFile: String, + historicFile: File, + oneHotEncode: Boolean = false): DataFrame = { + val featureColumns = dataset.columns.filter(_ != mockLabelColumn) + val feat = new Featurize() + .setNumberOfFeatures(10) + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncode) + val featModel = feat.fit(dataset) + val result = featModel.transform(dataset) + // Write out file so it is easy to compare the results + result.repartition(1).write.json(tempFile) + if (!Files.exists(historicFile.toPath)) { + // Store result in file for future + val directory = historicFile.toString.replace(".json", "") + result.repartition(1).write.json(directory) + val directoryFile = new File(directory) + val jsonFile = directoryFile.listFiles().filter(file => file.toString.endsWith(".json"))(0) + jsonFile.renameTo(historicFile) + FileUtilities.delTree(directoryFile) + } + val expResult = session.read.json(historicFile.toString) + // Verify the results are the same + verifyResult(expResult, result) + result + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val featureColumns = fitDataset.columns.filter(_ != mockLabelColumn) + estimator.asInstanceOf[Featurize].setFeatureColumns(Map(featuresColumn -> featureColumns)) + } + + override def createFitDataset: DataFrame = mockDataset + + override def schemaForDataset: StructType = ??? 
+ + override def getEstimator(): Estimator[_] = new Featurize() +} diff --git a/src/featurize/src/test/scala/benchmarkBasicDataTypes.json b/src/featurize/src/test/scala/benchmarkBasicDataTypes.json new file mode 100644 index 0000000000..cec71f6716 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkBasicDataTypes.json @@ -0,0 +1,12 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[0.0,2.0,1.0,12.0,0.5,0.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":false,"col4":1,"col5":100,"col6":30.0,"testColumn":{"type":1,"values":[1.0,3.0,0.0,30.0,0.4,100.0]}} +{"Label":0,"col1":4,"col2":0.78,"col3":true,"col4":2,"col5":50,"col6":12.0,"testColumn":{"type":1,"values":[2.0,4.0,1.0,12.0,0.78,50.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,5.0,0.0,12.0,0.12,0.0]}} +{"Label":0,"col1":1,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":30.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,30.0,0.5,0.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":false,"col4":1,"col5":10,"col6":12.0,"testColumn":{"type":1,"values":[1.0,3.0,0.0,12.0,0.4,10.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":false,"col4":2,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[2.0,3.0,0.0,12.0,0.78,0.0]}} +{"Label":1,"col1":4,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,4.0,0.0,12.0,0.12,0.0]}} +{"Label":0,"col1":0,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[0.0,0.0,1.0,12.0,0.5,0.0]}} +{"Label":1,"col1":2,"col2":0.4,"col3":false,"col4":1,"col5":127,"col6":30.0,"testColumn":{"type":1,"values":[1.0,2.0,0.0,30.0,0.4,127.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":true,"col4":2,"col5":-128,"col6":12.0,"testColumn":{"type":1,"values":[2.0,3.0,1.0,12.0,0.78,-128.0]}} +{"Label":1,"col1":4,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,4.0,0.0,12.0,0.12,0.0]}} diff --git a/src/featurize/src/test/scala/benchmarkNoOneHot.json b/src/featurize/src/test/scala/benchmarkNoOneHot.json new file mode 100644 index 0000000000..bf00792485 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkNoOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":2,"col5":1,"testColumn":{"type":1,"values":[1.0,2.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1,"col5":2,"testColumn":{"type":1,"values":[2.0,1.0,3.0,0.5,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[0.0,2.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1,"col5":2,"testColumn":{"type":1,"values":[2.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[0.0,2.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":0,"col5":2,"testColumn":{"type":1,"values":[2.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkOneHot.json b/src/featurize/src/test/scala/benchmarkOneHot.json new file mode 100644 index 0000000000..7b193d5113 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":2,"col5":1,"testColumn":{"type":1,"values":[0.0,1.0,0.0,0.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,0.0,1.0,3.0,0.5,0.4]}} 
+{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,0.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":0,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,1.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkString.json b/src/featurize/src/test/scala/benchmarkString.json new file mode 100644 index 0000000000..f6a333ae77 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkString.json @@ -0,0 +1,5 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":"pokemon are everywhere","testColumn":{"type":0,"size":11,"indices":[0,1,2,7,9,10],"values":[2.0,0.6,0.5,1.0,1.0,1.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":"they are in the woods","testColumn":{"type":1,"values":[3.0,0.5,0.4,1.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":"they are in the water","testColumn":{"type":1,"values":[4.0,0.99,0.78,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":"they are in the fields","testColumn":{"type":1,"values":[5.0,0.34,0.12,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":"pokemon - gotta catch em all","testColumn":{"type":1,"values":[3.0,0.99,0.78,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0]}} diff --git a/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json b/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json new file mode 100644 index 0000000000..c542136b16 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":0.0,"col5":2.0,"testColumn":{"type":1,"values":[0.0,0.0,1.0,0.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,1.0,3.0,0.5,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":0.0,"col5":1.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,0.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":0.0,"col5":1.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,0.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":2.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkStringMissing.json b/src/featurize/src/test/scala/benchmarkStringMissing.json new file mode 100644 index 0000000000..5bcc3f4166 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkStringMissing.json @@ -0,0 +1,5 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":"pokemon are everywhere","testColumn":{"type":1,"values":[2.0,0.5,0.0,0.0,1.0,0.0,1.0,1.0]}} +{"Label":1,"col1":3,"col2":0.4,"testColumn":{"type":0,"size":8,"indices":[0,1],"values":[3.0,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":"they are in the water","testColumn":{"type":1,"values":[4.0,0.78,1.0,1.0,0.0,0.0,2.0,1.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":"they are in the fields","testColumn":{"type":1,"values":[5.0,0.12,1.0,1.0,0.0,1.0,2.0,0.0]}} +{"Label":0,"col1":3,"col2":0.78,"testColumn":{"type":0,"size":8,"indices":[0,1],"values":[3.0,0.78]}} diff --git 
a/src/featurize/src/test/scala/benchmarkVectors.json b/src/featurize/src/test/scala/benchmarkVectors.json new file mode 100644 index 0000000000..4bc319c1a7 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkVectors.json @@ -0,0 +1,7 @@ +{"Label":0,"col1":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"col2":0.5,"col3":0.6,"col4":0,"col5":{"type":1,"values":[1.0,0.1,-1.5]},"testColumn":{"type":1,"values":[0.0,1.0,0.0,2.0,0.6,0.5,1.0,0.1,-1.5]}} +{"Label":1,"col1":{"type":1,"values":[1.5,0.2,-1.2]},"col2":0.4,"col3":0.5,"col4":1,"col5":{"type":1,"values":[1.5,0.2,-1.2]},"testColumn":{"type":1,"values":[1.0,1.5,0.2,-1.2,0.5,0.4,1.5,0.2,-1.2]}} +{"Label":1,"col1":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"col2":0.12,"col3":0.34,"col4":3,"col5":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"testColumn":{"type":1,"values":[3.0,1.0,0.0,2.0,0.34,0.12,1.0,0.0,2.0]}} +{"Label":0,"col1":{"type":1,"values":[1.1,0.5,-1.024]},"col2":0.5,"col3":0.6,"col4":0,"col5":{"type":1,"values":[1.0,0.4,-1.23]},"testColumn":{"type":1,"values":[0.0,1.1,0.5,-1.024,0.6,0.5,1.0,0.4,-1.23]}} +{"Label":1,"col1":{"type":1,"values":[1.1,0.5,-1.056]},"col2":0.4,"col3":0.5,"col4":1,"col5":{"type":1,"values":[1.1,0.5,-1.024]},"testColumn":{"type":1,"values":[1.0,1.1,0.5,-1.056,0.5,0.4,1.1,0.5,-1.024]}} +{"Label":0,"col1":{"type":1,"values":["NaN",0.2,-1.23]},"col2":0.78,"col3":0.99,"col4":2,"col5":{"type":1,"values":[1.0,0.1,-1.22]},"testColumn":{"type":1,"values":[2.0,"NaN",0.2,-1.23,0.99,0.78,1.0,0.1,-1.22]}} +{"Label":1,"col1":{"type":1,"values":[1.0,0.4,-1.23]},"col2":0.12,"col3":0.34,"col4":3,"col5":{"type":1,"values":["NaN",0.2,-1.23]},"testColumn":{"type":1,"values":[3.0,1.0,0.4,-1.23,0.34,0.12,"NaN",0.2,-1.23]}} diff --git a/src/find-best-model/build.sbt b/src/find-best-model/build.sbt new file mode 100644 index 0000000000..9aa1bc13cf --- /dev/null +++ b/src/find-best-model/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: compute-model-statistics +//> DependsOn: train-classifier diff --git a/src/find-best-model/src/main/scala/FindBestModel.scala b/src/find-best-model/src/main/scala/FindBestModel.scala new file mode 100644 index 0000000000..060283b614 --- /dev/null +++ b/src/find-best-model/src/main/scala/FindBestModel.scala @@ -0,0 +1,331 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.SchemaConstants +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode} +import org.apache.spark.ml._ +import org.apache.spark.ml.param.{Param, ParamMap, TransformerArrayParam} +import org.apache.spark.ml.util._ +import org.apache.spark.sql.types._ + +import scala.collection.mutable.ListBuffer + +object FindBestModel extends DefaultParamsReadable[FindBestModel] { + val modelNameCol = "model_name" + val metricsCol = "metric" + val paramsCol = "parameters" +} + +/** + * Evaluates and chooses the best model from a list of models. 
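+ *
+ * A minimal usage sketch (`trainedModel1`, `trainedModel2` and `testData` are hypothetical
+ * previously trained models and a held-out dataset):
+ * {{{
+ *   val best = new FindBestModel()
+ *     .setModels(Array(trainedModel1, trainedModel2))
+ *     .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric)
+ *     .fit(testData)
+ *   val scored = best.transform(testData)
+ * }}}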
+ */ +class FindBestModel(override val uid: String) extends Estimator[BestModel] with MMLParams { + + def this() = this(Identifiable.randomUID("FindBestModel")) + val models: TransformerArrayParam = new TransformerArrayParam(this, "models", "List of models to be evaluated") + + def getModels: Array[Transformer] = $(models) + + /** @group setParam **/ + def setModels(value: Array[Transformer]): this.type = set(models, value) + + /** @group setParam **/ + val evaluationMetric: Param[String] = StringParam(this, "evaluationMetric", "Metric to evaluate models with", + (s: String) => Seq(ComputeModelStatistics.MseSparkMetric, + ComputeModelStatistics.RmseSparkMetric, + ComputeModelStatistics.R2SparkMetric, + ComputeModelStatistics.MaeSparkMetric, + ComputeModelStatistics.AccuracySparkMetric, + ComputeModelStatistics.PrecisionSparkMetric, + ComputeModelStatistics.RecallSparkMetric, + ComputeModelStatistics.AucSparkMetric) contains s) + + // Set default evaluation metric to accuracy + setDefault(evaluationMetric -> ComputeModelStatistics.AccuracySparkMetric) + + def getEvaluationMetric: String = $(evaluationMetric) + + /** @group setParam **/ + def setEvaluationMetric(value: String): this.type = set(evaluationMetric, value) + + var selectedModel: Transformer = null + + var selectedScoredDataset: Dataset[_] = null + + var selectedROCCurve: DataFrame = null + + var selectedBestModelMetrics: Dataset[_] = null + + /** + * + * @param dataset - The input dataset, to be fitted + * @return The Model that results from the fitting + */ + override def fit(dataset: Dataset[_]): BestModel = { + // Staging + val trainedModels = getModels + if (trainedModels.isEmpty) { + throw new Exception("No trained models to evaluate.") + } + // Find type of trained models + def modelTypeDiscriminant(model: Transformer):String = { + model match { + case reg: TrainedRegressorModel => SchemaConstants.RegressionKind + case cls: TrainedClassifierModel => SchemaConstants.ClassificationKind + case evm: BestModel => modelTypeDiscriminant(evm.getBestModel) + case _ => throw new Exception("Model type not supported for evaluation") + } + } + val modelType = modelTypeDiscriminant(trainedModels(0)) + val evaluator = new ComputeModelStatistics() + evaluator.set(evaluator.evaluationMetric, getEvaluationMetric) + + var bestMetric: Double = Double.NaN + // Setup to store metrics and model name data for model metrics table + val modelMetrics = ListBuffer[Double]() + val models = ListBuffer[String]() + val parameters = ListBuffer[String]() + + // TODO: Add the other metrics + // TODO: Check metrics per model + val chooseHighest = (current: Double, best: Double) => { current > best } + val chooseLowest = (current: Double, best: Double) => { current < best } + val (evaluationMetricColumnName, operator): (String, (Double, Double) => Boolean) = modelType match { + case SchemaConstants.RegressionKind => getEvaluationMetric match { + case ComputeModelStatistics.MseSparkMetric => (ComputeModelStatistics.MseColumnName, chooseLowest) + case ComputeModelStatistics.RmseSparkMetric => (ComputeModelStatistics.RmseColumnName, chooseLowest) + case ComputeModelStatistics.R2SparkMetric => (ComputeModelStatistics.R2ColumnName, chooseHighest) + case ComputeModelStatistics.MaeSparkMetric => (ComputeModelStatistics.MaeColumnName, chooseLowest) + case _ => throw new Exception("Metric is not supported for regressors") + } + case SchemaConstants.ClassificationKind => getEvaluationMetric match { + case ComputeModelStatistics.AucSparkMetric => 
(ComputeModelStatistics.AucColumnName, chooseHighest) + case ComputeModelStatistics.PrecisionSparkMetric => (ComputeModelStatistics.PrecisionColumnName, chooseHighest) + case ComputeModelStatistics.RecallSparkMetric => (ComputeModelStatistics.RecallColumnName, chooseHighest) + case ComputeModelStatistics.AccuracySparkMetric => (ComputeModelStatistics.AccuracyColumnName, chooseHighest) + case _ => throw new Exception("Metric is not supported for classifiers") + } + case _ => throw new Exception("Model type not supported for evaluation") + } + + val compareModels = (model: Transformer, metrics: DataFrame, scoredDataset: Dataset[_]) => { + val currentMetric = metrics.select(evaluationMetricColumnName).first()(0).toString.toDouble + modelMetrics += currentMetric + models += model.uid + def getModelParams(model: Transformer): ParamMap = { + model match { + case reg: TrainedRegressorModel => reg.getParamMap + case cls: TrainedClassifierModel => cls.getParamMap + case evm: BestModel => getModelParams(evm.getBestModel) + case _ => throw new Exception("Model type not supported for evaluation") + } + } + parameters += getModelParams(model).toSeq.map { case pv => s"${pv.param.name}: ${pv.value}" }.mkString(", ") + if (bestMetric.isNaN || operator(currentMetric, bestMetric)) { + bestMetric = currentMetric + selectedModel = model + selectedScoredDataset = scoredDataset + } + } + + for (trainedModel <- trainedModels) { + // Check that models are consistent + if (modelTypeDiscriminant(trainedModel) != modelType) { + throw new Exception("Models are inconsistent. Please evaluate only regressors or classifiers.") + } + val df = trainedModel.transform(dataset) + val metrics = evaluator.transform(df) + compareModels(trainedModel, metrics, df) + } + + // compute ROC curve + evaluator.set(evaluator.evaluationMetric, ComputeModelStatistics.AllSparkMetrics) + selectedBestModelMetrics = evaluator.transform(selectedScoredDataset) + selectedROCCurve = evaluator.rocCurve + + val spark = dataset.sparkSession + val allModelMetricsSchema = StructType(Seq(StructField(FindBestModel.modelNameCol, StringType, true), + StructField(FindBestModel.metricsCol, DoubleType, true), + StructField(FindBestModel.paramsCol, StringType, true))) + var allModelMetrics = spark.createDataFrame(spark.sparkContext.parallelize(models.zip(modelMetrics).zip(parameters) + .map(mmp => Row(mmp._1._1, mmp._1._2, mmp._2))), allModelMetricsSchema) + new BestModel(uid, + selectedModel, + selectedScoredDataset, + selectedROCCurve, + selectedBestModelMetrics, + allModelMetrics) + } + + // Choose a random model as we don't know which one will be chosen yet - all will transform schema in same way + def transformSchema(schema: StructType): StructType = getModels(0).transformSchema(schema) + + def copy(extra: ParamMap): FindBestModel = defaultCopy(extra) + +} + +/** + * Model produced by [[FindBestModel]]. 
+ */ +class BestModel(val uid: String, + val model: Transformer, + val scoredDataset: Dataset[_], + val rocCurve: DataFrame, + val bestModelMetrics: Dataset[_], + val allModelMetrics: Dataset[_]) + extends Model[BestModel] with MLWritable { + + override def write: MLWriter = new BestModel.EvaluateModelWriter(uid, + new Pipeline().setStages(Array(model)).fit(scoredDataset), + scoredDataset, + rocCurve, + bestModelMetrics, + allModelMetrics) + + override def copy(extra: ParamMap): BestModel = + new BestModel(uid, model.copy(extra), scoredDataset, rocCurve, bestModelMetrics, allModelMetrics) + + override def transform(dataset: Dataset[_]): DataFrame = model.transform(dataset) + + /** + * The best model found during evaluation. + * @return The best model. + */ + def getBestModel: Transformer = model + + /** + * Gets the scored dataset. + * @return The scored dataset for the best model. + */ + def getScoredDataset: Dataset[_] = scoredDataset + + /** + * Gets the ROC curve with TPR, FPR. + * @return The evaluation results. + */ + def getEvaluationResults: Dataset[_] = rocCurve + + /** + * Gets all of the best model metrics results from the evaluator. + * @return All of the best model metrics results. + */ + def getBestModelMetrics: Dataset[_] = bestModelMetrics + + /** + * Gets a table of metrics from all models compared from the evaluation comparison. + * @return The model metrics results from all models. + */ + def getAllModelMetrics: Dataset[_] = allModelMetrics + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = model.transformSchema(schema) + +} + +object BestModel extends MLReadable[BestModel] { + + private val modelPart = "model" + private val scoredDatasetPart = "scoredDataset" + private val rocCurvePart = "rocCurve" + private val bestModelMetricsPart = "bestModelMetrics" + private val allModelMetricsPart = "allModelMetrics" + private val dataPart = "data" + + override def read: MLReader[BestModel] = new BestModelReader + + override def load(path: String): BestModel = super.load(path) + + /** [[MLWriter]] instance for [[BestModel]] */ + private[BestModel] + class EvaluateModelWriter(val uid: String, + val model: PipelineModel, + val scoredDataset: Dataset[_], + val rocCurve: DataFrame, + val bestModelMetrics: Dataset[_], + val allModelMetrics: Dataset[_]) + extends MLWriter { + private case class Data(uid: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + BestModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + // save the model + val modelPath = new Path(qualPath, modelPart).toString + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(modelPath) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + + // save the scored dataset + val scoredDatasetPath = new Path(qualPath, scoredDatasetPart).toString + scoredDataset.write.mode(saveMode).parquet(scoredDatasetPath) + + // save the roc curve + val rocCurvePath = new Path(qualPath, rocCurvePart).toString + rocCurve.write.mode(saveMode).parquet(rocCurvePath) + + // save the best model metrics + val bestModelMetricsPath = new Path(qualPath, bestModelMetricsPart).toString + bestModelMetrics.write.mode(saveMode).parquet(bestModelMetricsPath) + + // save all model 
metrics + val allModelMetricsPath = new Path(qualPath, allModelMetricsPart).toString + allModelMetrics.write.mode(saveMode).parquet(allModelMetricsPath) + + // save model data + val data = Data(uid) + val dataPath = new Path(qualPath, dataPart).toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class BestModelReader + extends MLReader[BestModel] { + + override def load(path: String): BestModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String) = data.select("uid").head() + + // retrieve the underlying model + val modelPath = new Path(qualPath, modelPart).toString + val model = PipelineModel.load(modelPath) + + // retrieve the scored dataset + val scoredDatasetPath = new Path(qualPath, scoredDatasetPart).toString + val scoredDataset = sparkSession.read.parquet(scoredDatasetPath) + + // retrieve the roc curve + val rocCurvePath = new Path(qualPath, rocCurvePart).toString + val rocCurve = sparkSession.read.parquet(rocCurvePath) + + // retrieve the best model metrics + val bestModelMetricsPath = new Path(qualPath, bestModelMetricsPart).toString + val bestModelMetrics = sparkSession.read.parquet(bestModelMetricsPath) + + // retrieve all model metrics + val allModelMetricsPath = new Path(qualPath, allModelMetricsPart).toString + val allModelMetrics = sparkSession.read.parquet(allModelMetricsPath) + + new BestModel(uid, model.stages(0), scoredDataset, rocCurve, bestModelMetrics, allModelMetrics) + } + } + +} diff --git a/src/find-best-model/src/test/scala/VerifyFindBestModel.scala b/src/find-best-model/src/test/scala/VerifyFindBestModel.scala new file mode 100644 index 0000000000..d36998a7a3 --- /dev/null +++ b/src/find-best-model/src/test/scala/VerifyFindBestModel.scala @@ -0,0 +1,106 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
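For orientation, here is a minimal usage sketch of the estimator defined above (not part of the original patch). It assumes two already-trained MMLSpark models held in hypothetical vals trainedLrModel and trainedRfModel, a labeled DataFrame df, and an illustrative save path; everything else uses only the API shown in FindBestModel and BestModel.

import org.apache.spark.ml.Transformer

// Candidate models must all be of the same kind (all classifiers or all regressors).
// trainedLrModel and trainedRfModel are placeholders for trained MMLSpark models.
val candidates: Array[Transformer] = Array(trainedLrModel, trainedRfModel)

val best = new FindBestModel()
  .setModels(candidates)
  .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric)
  .fit(df)                                   // returns a BestModel

val scored   = best.transform(df)            // scores with the winning model
val perModel = best.getAllModelMetrics       // model_name / metric / parameters table
val roc      = best.getEvaluationResults     // ROC curve of the winning model

best.write.overwrite().save("/tmp/bestModel")   // BestModel is MLWritable
val reloaded = BestModel.load("/tmp/bestModel")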
+ +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + +class VerifyFindBestModel extends EstimatorFuzzingTest { + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test to verify that evaluate can be run") { + val dataset = createMockDataset + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + val model = randomForestClassifier.fit(dataset) + val findBestModel = new FindBestModel() + .setModels(Array(model.asInstanceOf[Transformer], model.asInstanceOf[Transformer])) + .setEvaluationMetric(ComputeModelStatistics.AccuracySparkMetric) + val bestModel = findBestModel.fit(dataset) + bestModel.transform(dataset) + } + + test("Verify the best model can be saved") { + val dataset: DataFrame = createMockDataset + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val model = logisticRegressor.fit(dataset) + + val findBestModel = new FindBestModel() + .setModels(Array(model.asInstanceOf[Transformer], model.asInstanceOf[Transformer])) + .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric) + val bestModel = findBestModel.fit(dataset) + + val myModelName = "testEvalModel" + bestModel.save(myModelName) + val dir = new File(myModelName) + // assert directory exists + assert(dir.exists()) + // delete the file to cleanup + FileUtilities.delTree(dir) + } + + test("Verify the best model metrics can be retrieved and are valid") { + val dataset: DataFrame = createMockDataset + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val decisionTreeClassifier = TrainClassifierTestUtilities.createDecisionTreeClassifier(mockLabelColumn) + val GBTClassifier = TrainClassifierTestUtilities.createGradientBoostedTreesClassifier(mockLabelColumn) + val naiveBayesClassifier = TrainClassifierTestUtilities.createNaiveBayesClassifier(mockLabelColumn) + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + val model1 = logisticRegressor.fit(dataset) + val model2 = decisionTreeClassifier.fit(dataset) + val model3 = GBTClassifier.fit(dataset) + val model4 = naiveBayesClassifier.fit(dataset) + val model5 = randomForestClassifier.fit(dataset) + + val findBestModel = new FindBestModel() + .setModels(Array(model1.asInstanceOf[Transformer], model2, model3, model4, model5)) + .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric) + val bestModel = findBestModel.fit(dataset) + // validate schema is as expected + assert(bestModel.getAllModelMetrics.schema == + StructType(Seq(StructField(FindBestModel.modelNameCol, StringType, true), + StructField(FindBestModel.metricsCol, DoubleType, true), + StructField(FindBestModel.paramsCol, StringType, true)))) + // validate we got metrics for every model + assert(bestModel.getAllModelMetrics.count() == 5) + // validate AUC looks valid + bestModel.getAllModelMetrics + 
.select(FindBestModel.metricsCol) + .collect() + .foreach(value => assert(value.getDouble(0) >= 0.5)) + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val assembleFeatures = estimator.asInstanceOf[FindBestModel] + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val model = logisticRegressor.fit(createMockDataset) + assembleFeatures.setModels(Array(model, model)) + } + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new FindBestModel() +} diff --git a/src/fuzzing/build.sbt b/src/fuzzing/build.sbt new file mode 100644 index 0000000000..4e7b0eb87c --- /dev/null +++ b/src/fuzzing/build.sbt @@ -0,0 +1,5 @@ +//> DependsOn: core +//> DependsOn: utils +//> DependsOn: compute-model-statistics +//> DependsOn: find-best-model +//> DependsOn: featurize diff --git a/src/fuzzing/src/test/scala/Fuzzing.scala b/src/fuzzing/src/test/scala/Fuzzing.scala new file mode 100644 index 0000000000..b4480d1467 --- /dev/null +++ b/src/fuzzing/src/test/scala/Fuzzing.scala @@ -0,0 +1,254 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import FileUtilities.File +import org.apache.spark.ml._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{MLReadable, MLWritable} +import org.apache.spark.sql.DataFrame + +import scala.language.existentials +import scala.util.Random + +/** + * Tests to validate fuzzing of modules + */ +class Fuzzing extends TestBase { + + // Needed because the session in MTB is lazy + session + + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val randomSeed = new Random() + + // Use this for more detailed output from the Jar Loader + val debug = false + + // use this to quickly see all the results for all failing modules + // Note that this could make the tests pass when they should be failing + val disableFailure = false + + test("Verify all PipelineStages can be saved then loaded") { + + val exemptions: Set[String] = Set() + + val applicableStages = pipelineStages.filter(t => !exemptions(t.getClass.getName)) + applicableStages.foreach(t => if (!readerMap.contains(t.getClass.getName)) { + assertOrLog(false, s"need to have a companion reader for class ${t.getClass.getName}") + }) + + applicableStages.foreach(t => trySave(t, Some(readerMap(t.getClass.getName)))) + } + + // TODO verify that model UIDs match the class names, perhaps use a Trait + + test("Verify all estimators can be turned into pipelines, saved and loaded") { + estimators.foreach(est => { + val estimatorName = est.getClass.getName + println() + println(s"Running estimator: ${est.toString} with name: ${estimatorName}") + val (dataset, pipelineStage) = + if (estimatorFuzzers.contains(estimatorName)) { + println("Generating dataset from estimator fuzzer") + val estimatorFuzzer = estimatorFuzzers(estimatorName) + val fitDataset = estimatorFuzzer.createFitDataset + val estUpdated = estimatorFuzzer.setParams(fitDataset, est.copy(ParamMap())) + (fitDataset, estUpdated.asInstanceOf[PipelineStage]) + } else { + println("Generating random dataset") + (createDataSet, est.copy(ParamMap()).asInstanceOf[PipelineStage]) + } + tryRun(() => { + var pipelineModel = new Pipeline().setStages(Array(pipelineStage)).fit(dataset) + pipelineModel = trySave(pipelineModel, 
+ Some(PipelineModel.asInstanceOf[MLReadable[Any]])).get.asInstanceOf[PipelineModel] + val dfTransform = + if (estimatorFuzzers.contains(estimatorName)) { + estimatorFuzzers(estimatorName).createTransformDataset + } else { + createDataSet + } + pipelineModel.transform(dfTransform) + () + }) + }) + } + + test("Verify all transformers can be turned into pipelines, saved and loaded") { + transformers.foreach(tr => { + val transformerName = tr.getClass.getName + println() + println(s"Running transformer: ${tr.toString} with name: ${transformerName}") + val (dataset, pipelineStage) = + if (transformerFuzzers.contains(transformerName)) { + println("Generating dataset from transformer fuzzer") + val transformerFuzzer = transformerFuzzers(transformerName) + val fitDataset = transformerFuzzer.createDataset + val trUpdated = transformerFuzzer.setParams(fitDataset, tr.copy(ParamMap())) + (fitDataset, trUpdated.asInstanceOf[PipelineStage]) + } else { + println("Generating random dataset") + (createDataSet, tr.copy(ParamMap()).asInstanceOf[PipelineStage]) + } + tryRun(() => { + val pipeline = new Pipeline().setStages(Array(pipelineStage)) + val pipelineModel = pipeline.fit(dataset) + trySave(pipelineModel) + () + }) + }) + } + + test("Verify all pipeline stages don't have exotic characters") { + val badChars = List(",", "\"", "'", ".") + pipelineStages.foreach { pipelineStage => + pipelineStage.params.foreach { param => + // the param name must not contain any of the bad characters + assertOrLog(!badChars.exists(c => param.name.contains(c)), param.name) + assertOrLog(!param.doc.contains("\"")) + } + } + } + + test("Verify all pipeline stage values match their param names") { + val exemptions: Set[String] = Set() + pipelineStages.foreach { pipelineStage => + if (!exemptions(pipelineStage.getClass.getName)) { + val paramFields = pipelineStage.getClass.getDeclaredFields + .filter(f => classOf[Param[Any]].isAssignableFrom(f.getType)) + + val paramNames = paramFields.map { f => + f.setAccessible(true) + val p = f.get(pipelineStage) + p.asInstanceOf[Param[Any]].name + } + val paramFieldNames = paramFields.map(_.getName) + assertOrLog(paramNames === paramFieldNames, pipelineStage.getClass.getName) + } + } + } + + test("Verify correct use of mixins") { + val triggers = Map( + "inputCol" -> classOf[HasInputCol], + "inputColumn" -> classOf[HasInputCol], + "outputCol" -> classOf[HasOutputCol], + "outputColumn" -> classOf[HasOutputCol], + "labelCol" -> classOf[HasLabelCol], + "labelColumn" -> classOf[HasLabelCol], + "featuresCol" -> classOf[HasFeaturesCol], + "featuresColumn" -> classOf[HasFeaturesCol] + ) + + val exemptions = Set[String]( + "org.apache.spark.ml.feature.FastVectorAssembler", // In Spark namespace + "com.microsoft.ml.spark.TextFeaturizer" // needs to hide setters from model + ) + pipelineStages.foreach { stage => + if (!exemptions(stage.getClass.getName)) { + stage.params.foreach { param => + triggers.get(param.name) match { + case Some(clazz) => + assertOrLog(clazz.isAssignableFrom(stage.getClass), + stage.getClass.getName + " needs to extend " + clazz.getName) + case None => + } + } + } + } + } + + private def assertOrLog(condition: Boolean, hint: String = "", + disableFailure: Boolean = disableFailure): Unit = { + if (disableFailure && !condition) { + println(hint) + } else { + assert(condition, hint) + } + () + } + + private def throwOrLog(e: Throwable, message: String = "", + disableFailure: Boolean = disableFailure): Unit = { + println(message) + if (disableFailure) { + println(e.getMessage) + e.printStackTrace(System.out) + } else { + throw e + } + } + + // set the context loader
to pick up on the jars + Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader) + + private lazy val transformers: List[Transformer] = JarLoadingUtils.loadClass[Transformer](debug = debug) + + private lazy val estimators: List[Estimator[_]] = JarLoadingUtils.loadClass[Estimator[_]](debug = debug) + + private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.loadObject[MLReadable[_]](debug = debug) + + private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.loadClass[PipelineStage](debug = debug) + + private lazy val readerMap = readers.map { + r => (r.getClass.getName.dropRight(1), r.asInstanceOf[MLReadable[Any]]) + }.toMap + + private lazy val transformerFuzzers: Map[String, TransformerFuzzingTest] = + JarLoadingUtils.loadTestClass[TransformerFuzzingTest](debug = debug) + .map(tr => (tr.getClassName, tr)).toMap + + private lazy val estimatorFuzzers: Map[String, EstimatorFuzzingTest] = + JarLoadingUtils.loadTestClass[EstimatorFuzzingTest](debug = debug) + .map(est => (est.getClassName, est)).toMap + + private def trySave(stage: PipelineStage, reader: Option[MLReadable[Any]] = None, + path: String = "testModels"): Option[PipelineStage] = { + stage match { + case w: PipelineStage with MLWritable => + try { + w.write.overwrite().save(path) + reader match { + case Some(r) => + val loaded = r.load(path).asInstanceOf[PipelineStage] + assertOrLog(loaded.params.sameElements(w.params)) + println(s"Round trip succeeded for ${w.getClass.getName}") + Some(loaded) + case None => None + } + } catch { + case e: Throwable => + throwOrLog(e, w.getClass.getName + " encounters an error while saving/loading") + None + } finally { + FileUtilities.delTree(new File(path)) + () + } + case tr => + assertOrLog(false, tr.getClass.getName + " needs to extend MLWritable") + None + } + } + + private def createDataSet: DataFrame = { + GenerateDataset + .generateDataset(session, + new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + randomSeed.nextLong()) + } + + private def tryRun(func: () => Unit): Unit = { + try { + func() + } catch { + case ne: java.util.NoSuchElementException => + throwOrLog(ne, s"Could not transform: $ne", disableFailure=true) + case th: Throwable => + throwOrLog(th, s"Encountered unknown error: $th", disableFailure=true) + } + } + +} diff --git a/src/image-featurizer/build.sbt b/src/image-featurizer/build.sbt new file mode 100644 index 0000000000..bf243751d5 --- /dev/null +++ b/src/image-featurizer/build.sbt @@ -0,0 +1,5 @@ +//> DependsOn: core +//> DependsOn: readers +//> DependsOn: downloader +//> DependsOn: cntk-model +//> DependsOn: image-transformer diff --git a/src/image-featurizer/src/main/scala/ImageFeaturizer.scala b/src/image-featurizer/src/main/scala/ImageFeaturizer.scala new file mode 100644 index 0000000000..7d59780eb9 --- /dev/null +++ b/src/image-featurizer/src/main/scala/ImageFeaturizer.scala @@ -0,0 +1,128 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
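Stripped of the reflection and logging, the round trip that trySave performs above is the standard Spark ML persistence pattern. A minimal sketch of that pattern for a single stage, under the assumption that the stage is MLWritable and its companion reader is at hand (the path name is illustrative):

import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.util.{MLReadable, MLWritable}

// Save a stage, load it back through its companion reader, and check that the
// parameter set survives the round trip.
def roundTrip(stage: PipelineStage with MLWritable,
              reader: MLReadable[_],
              path: String = "testModels"): PipelineStage = {
  stage.write.overwrite().save(path)
  val loaded = reader.load(path).asInstanceOf[PipelineStage]
  assert(loaded.params.sameElements(stage.params), s"params changed for ${stage.getClass.getName}")
  loaded
}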
+ +package com.microsoft.ml.spark + +import java.net.URI + +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.schema.DatasetExtensions +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.types.{ArrayType, FloatType, StructType} +import org.apache.spark.sql.{DataFrame, Dataset} + +object ImageFeaturizer extends DefaultParamsReadable[ImageFeaturizer] + +/** + * Class for featurizing images with pretrained CNTK models. The ImageFeaturizer allows one to + * leverage deep representations learned on large supervised datasets to improve image processing + * workflows. + * + * The ImageFeaturizer relies on a CNTK model to do the featurization; the model can be set with + * the modelLocation parameter. To map the nodes of the CNTK model onto the standard "layers" structure + * of a feed-forward neural net, one needs to supply a list of node names that range from the output node + * back towards the input node of the CNTK Function. + * This list does not need to be exhaustive; for models obtained + * from the ModelDownloader, the layer list can be found in the schema of the + * downloaded model. + * + * The ImageFeaturizer takes an input column of images + * (the type returned by the ImageReader), and + * automatically resizes them to fit the CNTKModel's input. It then feeds them through a pre-trained + * CNTK model. One can truncate the model using the cutOutputLayers parameter, which + * determines how many layers to cut from the output of the network. + * For example, layer=0 means that no layers are removed, + * and layer=2 means that the image featurizer returns the activations of the layer that is two layers + * from the output layer.
+ * + * @param uid the uid of the image featurizer + */ +class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with HasOutputCol with MMLParams { + def this() = this(Identifiable.randomUID("ImageFeaturizer")) + + val inputNode: IntParam = IntParam(this, "inputNode", "which node of the CNTK Function's inputs " + + "to use as the input (default 0)") + + def setInputNode(value: Int): this.type = set(inputNode, value) + + def getInputNode: Int = $(inputNode) + + val cutOutputLayers: IntParam = IntParam(this, "cutOutputLayers", "the number of layers to cut " + + "off the end of the network, 0 leaves the network intact," + + " 1 removes the output layer, etc", ParamValidators.gtEq(0)) + + def setCutOutputLayers(value: Int): this.type = set(cutOutputLayers, value) + + def getCutOutputLayers: Int = $(cutOutputLayers) + + val layerNames: StringArrayParam = new StringArrayParam(this, "layerNames", + "Array with valid CNTK nodes to choose from; the first entries of this array should be closer to the " + + "output node") + + def setLayerNames(value: Array[String]): this.type = set(layerNames, value) + + def getLayerNames: Array[String] = $(layerNames) + + val modelLocation: Param[String] = StringParam(this, "modelLocation", "the location of the model as a URI/URL", + { s: String => + try { + new URI(s) + true + } catch { + case e: Exception => false + } + }) + + def setModelLocation(value: String): this.type = set(modelLocation, value) + + def setModelLocation(value: URI): this.type = set(modelLocation, value.toString) + + def getModelLocation: String = $(modelLocation) + + def setModel(modelSchema: ModelSchema): this.type = { + setLayerNames(modelSchema.layerNames) + .setInputNode(modelSchema.inputNode) + .setModelLocation(modelSchema.uri.toString) + } + + setDefault(cutOutputLayers -> 1, inputNode -> 0, outputCol -> (uid + "_output")) + + override def transform(dataset: Dataset[_]): DataFrame = { + val spark = dataset.sparkSession + + val resizedCol = DatasetExtensions.findUnusedColumnName("resized")(dataset.columns.toSet) + + val cntkModel = new CNTKModel() + .setModel(dataset.sparkSession, getModelLocation) + .setInputNode(getInputNode) + .setOutputNodeName(getLayerNames.apply(getCutOutputLayers)) + .setInputCol(resizedCol) + .setOutputCol(getOutputCol) + + val requiredSize = CNTKModel.loadModelFromBytes(cntkModel.getModel) + .getArguments.get(0).getShape().getDimensions + + val prepare = new ImageTransformer() + .setInputCol($(inputCol)) + .resize(requiredSize(0).toInt, requiredSize(1).toInt) + + val unroll = new UnrollImage() + .setInputCol(prepare.getOutputCol) + .setOutputCol(resizedCol) + + val resizedDF = prepare.transform(dataset) + val unrolledDF = unroll.transform(resizedDF).drop(prepare.getOutputCol) + val featurizedDF = cntkModel.transform(unrolledDF).drop(resizedCol) + featurizedDF + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add(getOutputCol, new ArrayType(FloatType, false)) + } + +} diff --git a/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala b/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala new file mode 100644 index 0000000000..bb81bb56fd --- /dev/null +++ b/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala @@ -0,0 +1,66 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information.
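A short end-to-end sketch of how the featurizer above is meant to be used for transfer learning: download a pretrained network and turn an image column into a feature-vector column. The directory paths are placeholders, session denotes the active SparkSession used throughout these suites, and the ModelDownloader and readImages APIs are the ones introduced elsewhere in this patch.

import com.microsoft.ml.spark.Readers.implicits._

// Fetch a pretrained network with the ModelDownloader (local path is illustrative).
val downloader = new ModelDownloader(session, new java.io.File("/tmp/models").toURI)
val resNet = downloader.downloadByName("ResNet50")

// Featurize images: cutOutputLayers = 1 drops the classification layer and
// returns the activations of the penultimate layer as the feature vector.
val images = session.readImages("/tmp/my-images", recursive = true)
val features = new ImageFeaturizer()
  .setModel(resNet)
  .setInputCol("image")
  .setOutputCol("features")
  .setCutOutputLayers(1)
  .transform(images)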
+ +package com.microsoft.ml.spark + +import java.net.URI + +import org.apache.spark.sql.DataFrame +import com.microsoft.ml.spark.FileUtilities.File +import org.apache.spark.ml.linalg.DenseVector +import com.microsoft.ml.spark.Readers.implicits._ + +import scala.collection.JavaConversions._ + +class ImageFeaturizerSuite extends LinuxOnly with CNTKTestUtils { + val images: DataFrame = session.readImages(imagePath, true).withColumnRenamed("image", inputCol) + + val modelDir = new File(filesRoot, "CNTKModel") + val modelDownloader = new ModelDownloader(session, modelDir.toURI) + + lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI + lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50") + + test("Image featurizer should reproduce the CIFAR10 experiment") { + val model = new ImageFeaturizer() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setModelLocation(s"${sys.env("DATASETS_HOME")}/CNTKModel/ConvNet_CIFAR10.model") + .setCutOutputLayers(0) + .setLayerNames(Array("z")) + val result = model.transform(images) + compareToTestModel(result) + } + + test("the Image feature should work with the modelSchema") { + val model = new ImageFeaturizer() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setModel(resNet) + .setCutOutputLayers(0) + val result = model.transform(images) + compareToTestModel(result) + } + + test("Image featurizer should work with ResNet50", TestBase.Extended) { + val model = new ImageFeaturizer() + .setModel(resNet) + .setInputCol(inputCol) + .setOutputCol(outputCol) + val result = model.transform(images) + val resVec = result.select(outputCol).collect()(0).getAs[DenseVector](0) + assert(resVec.size == 1000) + } + + test("test layers of network", TestBase.Extended) { + (0 to 9).foreach({ i => + val model = new ImageFeaturizer() + .setModel(resNet) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setCutOutputLayers(i) + val result = model.transform(images) + }) + } + +} diff --git a/src/image-transformer/build.sbt b/src/image-transformer/build.sbt new file mode 100644 index 0000000000..c354d24346 --- /dev/null +++ b/src/image-transformer/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: readers diff --git a/src/image-transformer/src/main/python/ImageTransform.py b/src/image-transformer/src/main/python/ImageTransform.py new file mode 100644 index 0000000000..f2eb61112b --- /dev/null +++ b/src/image-transformer/src/main/python/ImageTransform.py @@ -0,0 +1,96 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark.ml.common import inherit_doc +from pyspark.sql.types import * +from pyspark.sql.types import Row, _create_row +import numpy as np +from mmlspark._ImageTransformer import _ImageTransformer + +ImageFields = ["path", "height", "width", "type", "bytes"] + +ImageSchema = StructType([ + StructField(ImageFields[0], StringType(), True), + StructField(ImageFields[1], IntegerType(), True), + StructField(ImageFields[2], IntegerType(), True), + StructField(ImageFields[3], IntegerType(), True), # OpenCV type: CV_8U in most cases + StructField(ImageFields[4], BinaryType(), True) ]) # OpenCV bytes: row-wise BGR in most cases + +def toNDArray(image): + return np.asarray(image.bytes, dtype = np.uint8).reshape((image.height, image.width, 3))[:,:,(2,1,0)] + +def toImage(array, path = "", ocvType = 16): + length = np.prod(array.shape) + + data = bytearray(array.astype(dtype=np.int8)[:,:,(2,1,0)].reshape(length)) + height = array.shape[0] + width = array.shape[1] + # Creating new Row with _create_row(), because Row(name = value, ... ) orders fields by name, + # which conflicts with expected ImageSchema order when the new DataFrame is created by UDF + return _create_row(ImageFields, [path, height, width, ocvType, data]) + +from pyspark.ml.common import inherit_doc +@inherit_doc +class ImageTransform(_ImageTransformer): + """ + Resizes the image to the given width and height + :param int height: The height to resize to (>=0) + :param int width: The width to resize to (>=0) + """ + def resize(self, height, width): + self._java_obj.resize(height, width) + return self + + """ + Crops the image given the starting x,y coordinates + and the width and height + :param int x: The initial x coordinate (>=0) + :param int y: The initial y coordinate (>=0) + :param int height: The height to crop to (>=0) + :param int width: The width to crop to (>=0) + """ + def crop(self, x, y, height, width): + self._java_obj.crop(x,y,height,width) + return self + + """ + Formats the image to the given image format + :param int format: The format to convert to, please see OpenCV cvtColor function documentation for all formats + """ + def colorFormat(self, format): + self._java_obj.colorFormat(format) + return self + + """ + Blurs the image using a normalized box filter + :param double height: The height of the box filter (>= 0) + :param double width: The width of the box filter (>= 0) + """ + def blur(self, height, width): + self._java_obj.blur(height, width) + return self + + """ + Thresholds the image, please see OpenCV threshold function documentation for more information + :param double threshold: The threshold value + :param double maxVal: The maximum value to use + :param double thresholdType: The type of threshold, can be binary, binary_inv, trunc, zero, zero_inv + """ + def threshold(self, threshold, maxVal, thresholdType): + self._java_obj.threshold(threshold, maxVal, thresholdType) + return self + + """ + Blurs the image by applying a gaussian kernel + :param double appertureSize: The aperture size, which should be odd and positive + :param double sigma: The standard deviation of the gaussian + """ + def gaussianKernel(self, appertureSize, sigma): + self._java_obj.gaussianKernel(appertureSize, sigma) + return self diff --git a/src/image-transformer/src/main/scala/ImageTransformer.scala b/src/image-transformer/src/main/scala/ImageTransformer.scala new file mode 100644 index 0000000000..ec82e574dc --- /dev/null +++ 
b/src/image-transformer/src/main/scala/ImageTransformer.scala @@ -0,0 +1,314 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.DefaultParamsReadable +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.ml.param._ +import com.microsoft.ml.spark.schema.ImageSchema +import scala.collection.mutable.ListBuffer +import com.microsoft.ml.spark.schema.BinaryFileSchema +import scala.collection.mutable.{ListBuffer, WrappedArray} +import org.opencv.core.Core +import org.opencv.core.Mat +import org.opencv.core.{Rect, Size} +import org.opencv.imgproc.Imgproc +import org.apache.spark.ml.util.Identifiable + +abstract class ImageTransformerStage(params: Map[String, Any]) extends Serializable { + def apply(image: Mat): Mat + val stageName: String +} + +class ResizeImage(params: Map[String, Any]) extends ImageTransformerStage(params) { + val height = params(ResizeImage.height).asInstanceOf[Int].toDouble + val width = params(ResizeImage.width).asInstanceOf[Int].toDouble + override val stageName = ResizeImage.stageName + + override def apply(image: Mat): Mat = { + var resized = new Mat() + val sz = new Size(width, height) + Imgproc.resize(image, resized, sz) + resized + } +} + +object ResizeImage { + val stageName = "resize" + val height = "height" + val width = "width" +} + +class CropImage(params: Map[String, Any]) extends ImageTransformerStage(params) { + val x = params(CropImage.x).asInstanceOf[Int] + val y = params(CropImage.y).asInstanceOf[Int] + val height = params(CropImage.height).asInstanceOf[Int] + val width = params(CropImage.width).asInstanceOf[Int] + override val stageName = CropImage.stageName + + override def apply(image: Mat): Mat = { + val rect = new Rect(x, y, width, height) + new Mat(image, rect) + } +} + +object CropImage { + val stageName = "crop" + val x = "x" + val y = "y" + val height = "height" + val width = "width" +} + +/** + * Applies a color format to the image, eg COLOR_BGR2GRAY. 
+ */ +class ColorFormat(params: Map[String, Any]) extends ImageTransformerStage(params) { + val format = params(ColorFormat.format).asInstanceOf[Int] + override val stageName = ColorFormat.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.cvtColor(image, dst, format) + dst + } +} + +object ColorFormat { + val stageName = "colorformat" + val format = "format" +} + +/** + * Blurs the image + * @param params + */ +class Blur(params: Map[String, Any]) extends ImageTransformerStage(params) { + val height = params(Blur.height).asInstanceOf[Double] + val width = params(Blur.width).asInstanceOf[Double] + override val stageName = Blur.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.blur(image, dst, new Size(height, width)) + dst + } +} + +object Blur { + val stageName = "blur" + val height = "height" + val width = "width" +} + +/** + * Applies a threshold to the image + * @param params + */ +class Threshold(params: Map[String, Any]) extends ImageTransformerStage(params) { + val threshold = params(Threshold.threshold).asInstanceOf[Double] + val maxVal = params(Threshold.maxVal).asInstanceOf[Double] + // EG Imgproc.THRESH_BINARY + val thresholdType = params(Threshold.thresholdType).asInstanceOf[Int] + override val stageName = Threshold.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.threshold(image, dst, threshold, maxVal, thresholdType) + dst + } +} + +object Threshold { + val stageName = "threshold" + val threshold = "threshold" + val maxVal = "maxVal" + val thresholdType = "type" +} + +/** + * Applies gaussian kernel to the image + */ +class GaussianKernel(params: Map[String, Any]) extends ImageTransformerStage(params) { + val appertureSize = params(GaussianKernel.appertureSize).asInstanceOf[Int] + val sigma = params(GaussianKernel.sigma).asInstanceOf[Double] + override val stageName = GaussianKernel.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + val kernel = Imgproc.getGaussianKernel(appertureSize, sigma) + Imgproc.filter2D(image, dst, -1, kernel) + dst + } +} + +object GaussianKernel { + val stageName = "gaussiankernel" + val appertureSize = "appertureSize" + val sigma = "sigma" +} + +/** + * Pipelined image processing + */ +object ImageTransformer extends DefaultParamsReadable[ImageTransformer] { + + override def load(path: String): ImageTransformer = super.load(path) + + /** + * Convert Spark image representation to OpenCV format + */ + private def row2mat(row: Row): (String, Mat) = { + val path = ImageSchema.getPath(row) + val height = ImageSchema.getHeight(row) + val width = ImageSchema.getWidth(row) + val ocvType = ImageSchema.getType(row) + val bytes = ImageSchema.getBytes(row) + + val img = new Mat(height, width, ocvType) + img.put(0,0,bytes) + (path, img) + } + + /** + * Convert from OpenCV format to Dataframe Row; unroll if needed + */ + private def mat2row(img: Mat, path: String = ""): Row = { + var ocvBytes = new Array[Byte](img.total.toInt*img.elemSize.toInt) + img.get(0,0,ocvBytes) //extract OpenCV bytes + Row(path, img.height, img.width, img.`type`, ocvBytes) + } + + /** + * Apply all OpenCV transformation stages to a single image; unroll the result if needed + * For null inputs or binary files that could not be parsed, return None. + * Break on OpenCV errors. 
+ */ + def process(stages: Seq[ImageTransformerStage], decode: Boolean)(row: Row): Option[Row] = { + + if (row == null) return None + + val decoded = if (decode) { + val path = BinaryFileSchema.getPath(row) + val bytes = BinaryFileSchema.getBytes(row) + + //early return if the image can't be decompressed + ImageReader.decode(path, bytes).getOrElse(return None) + } else row + + var (path, img) = row2mat(decoded) + for (stage <- stages) { + img = stage.apply(img) + } + Some(mat2row(img, path)) + } +} + +@InternalWrapper +class ImageTransformer(val uid: String) extends Transformer + with HasInputCol with HasOutputCol with MMLParams { + + import com.microsoft.ml.spark.ImageTransformer._ + + def this() = this(Identifiable.randomUID("ImageTransformer")) + + val stages: ArrayMapParam = new ArrayMapParam(this, "stages", "image transformation stages") + def setStages(value: Array[Map[String, Any]]): this.type = set(stages, value) + def getStages: Array[Map[String, Any]] = $(stages) + private def addStage(stage: Map[String, Any]): this.type = set(stages, $(stages) :+ stage) + + setDefault(inputCol -> "image", + outputCol -> (uid + "_output"), + stages -> Array[Map[String, Any]]() + ) + + // every stage has a name like "resize", "normalize", "unroll" + val stageName = "action" + + def resize(height: Int, width: Int): this.type = { + require(width >= 0 && height >= 0, "width and height should be nonnegative") + + addStage(Map(stageName -> ResizeImage.stageName, + ResizeImage.width -> width, + ResizeImage.height -> height)) + } + + def crop(x: Int, y: Int, height: Int, width: Int): this.type = { + require(x >= 0 && y >= 0 && width >= 0 && height >= 0, "crop values should be nonnegative") + + addStage(Map(stageName -> CropImage.stageName, + CropImage.width -> width, + CropImage.height -> height, + CropImage.x -> x, + CropImage.y -> y)) + } + + def colorFormat(format: Int): this.type = { + addStage(Map(stageName -> ColorFormat.stageName, ColorFormat.format -> format)) + } + + def blur(height: Double, width: Double): this.type = { + addStage(Map(stageName -> Blur.stageName, Blur.height -> height, Blur.width -> width)) + } + + def threshold(threshold: Double, maxVal: Double, thresholdType: Int): this.type = { + addStage(Map(stageName -> Threshold.stageName, + Threshold.maxVal -> maxVal, + Threshold.threshold -> threshold, + Threshold.thresholdType -> thresholdType)) + } + + def gaussianKernel(appertureSize: Int, sigma: Double): this.type = { + addStage(Map(stageName -> GaussianKernel.stageName, + GaussianKernel.appertureSize -> appertureSize, + GaussianKernel.sigma -> sigma)) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + + // load native OpenCV library on each partition + // TODO: figure out more elegant way + val spark = dataset.sqlContext + + val schema = dataset.toDF.schema + + val loaded = ImageSchema.loadLibraryForAllPartitions(dataset.toDF.rdd, Core.NATIVE_LIBRARY_NAME) + + val df = spark.createDataFrame(loaded, schema) + + val isBinary = BinaryFileSchema.isBinaryFile(df, $(inputCol)) + assert(ImageSchema.isImage(df, $(inputCol)) || isBinary, "input column should have Image or BinaryFile type") + + var transforms = ListBuffer[ImageTransformerStage]() + for(stage <- $(stages)) { + stage(stageName) match { + case ResizeImage.stageName => transforms += new ResizeImage(stage) + case CropImage.stageName => transforms += new CropImage(stage) + case ColorFormat.stageName => transforms += new ColorFormat(stage) + case Blur.stageName => transforms += new Blur(stage) + case 
Threshold.stageName => transforms += new Threshold(stage) + case GaussianKernel.stageName => transforms += new GaussianKernel(stage) + case unsupported: String => throw new IllegalArgumentException(s"unsupported transformation $unsupported") + } + } + + val func = process(transforms, decode = isBinary)(_) + val convert = udf(func, ImageSchema.columnSchema) + + df.withColumn($(outputCol), convert(df($(inputCol)))) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add($(outputCol), ImageSchema.columnSchema) + } + +} + + diff --git a/src/image-transformer/src/main/scala/UnrollImage.scala b/src/image-transformer/src/main/scala/UnrollImage.scala new file mode 100644 index 0000000000..f05332de76 --- /dev/null +++ b/src/image-transformer/src/main/scala/UnrollImage.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.ImageSchema._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} + +object UnrollImage extends DefaultParamsReadable[UnrollImage] { + + private def unroll(row: Row): DenseVector = { + val width = getWidth(row) + val height = getHeight(row) + val bytes = getBytes(row) + + val area = width*height + require(area >= 0 && area < 1e8, "image has incorrect dimensions") + require(bytes.length == width*height*3, "image has incorrect number of bytes") + + var rearranged = Array.fill[Double](area*3)(0.0) + var count = 0 + for (c <- 0 until 3) { + for (h <- 0 until height) { + val offset = h*width*3 + for (w <- 0 until width) { + val b = bytes(offset + w*3 + c).toDouble + + // b is a signed byte: negative values wrap up to the 128..255 range, zero must stay zero + //TODO: is there a better way to convert to unsigned byte? + rearranged(count) = if (b >= 0) b else b + 256.0 + count += 1 + } + } + } + new DenseVector(rearranged) + } +} + +class UnrollImage(val uid: String) extends Transformer with HasInputCol with HasOutputCol with MMLParams { + def this() = this(Identifiable.randomUID("UnrollImage")) + + import com.microsoft.ml.spark.UnrollImage._ + + setDefault(inputCol -> "image", outputCol -> (uid + "_output")) + + override def transform(dataset: Dataset[_]): DataFrame = { + val df = dataset.toDF + assert(isImage(df, $(inputCol)), "input column should have Image type") + + val func = unroll(_) + val unrollUDF = udf(func) + + df.withColumn($(outputCol), unrollUDF(df($(inputCol)))) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add($(outputCol), VectorType) + } + +} + + diff --git a/src/image-transformer/src/test/scala/ImageTransformerSuite.scala b/src/image-transformer/src/test/scala/ImageTransformerSuite.scala new file mode 100644 index 0000000000..fc7e3b1a7b --- /dev/null +++ b/src/image-transformer/src/test/scala/ImageTransformerSuite.scala @@ -0,0 +1,293 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information.
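On the TODO in UnrollImage.unroll about unsigned bytes: masking with 0xFF is the usual JVM idiom and avoids the sign branch altogether. A small equivalent sketch of the same channel-major (CHW) unroll written that way, offered as an alternative rather than as part of the patch:

// Channel-major unroll of a row-wise BGR byte image; `& 0xFF` widens each
// signed byte (-128..127) to its unsigned pixel value (0..255).
def unrollUnsigned(bytes: Array[Byte], height: Int, width: Int): Array[Double] = {
  val out = new Array[Double](height * width * 3)
  var count = 0
  for (c <- 0 until 3; h <- 0 until height; w <- 0 until width) {
    out(count) = (bytes(h * width * 3 + w * 3 + c) & 0xFF).toDouble
    count += 1
  }
  out
}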
+ +package com.microsoft.ml.spark + +import java.awt.GridLayout +import java.nio.file.Paths +import javax.swing._ + +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.DataFrame +import org.opencv.core.{Mat, MatOfByte} +import org.opencv.imgcodecs.Imgcodecs +import org.opencv.imgproc.Imgproc +import org.apache.spark.sql.Row +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.Readers.implicits._ +import org.apache.spark.sql.SaveMode + +class ImageTransformerSuite extends LinuxOnly { + + val groceriesDirectory = "/Images/Grocery/" + private val fileLocation = s"${sys.env("DATASETS_HOME")}/$groceriesDirectory" + + test("general workflow") { + + val images = session.readImages(fileLocation, recursive = true) + assert(images.count() == 30) + + val size = (224,200) + val tr = new ImageTransformer() + .setOutputCol("out") + .resize(height = size._1, width = size._2) + .crop(x = 0, y = 0, height = 22, width = 26) + .resize(height = 15, width = 10) + + val preprocessed = tr.transform(images) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 15 && row.getInt(1) == 10, "output images have incorrect size") + } + ) + + val unroll = new UnrollImage() + .setInputCol(tr.getOutputCol) + .setOutputCol("final") + + val result = unroll.transform(preprocessed).select("final") + result.collect().foreach( + row => assert(row(0).asInstanceOf[DenseVector].toArray.length == 10*15*3, "unrolled image is incorrect")) + + } + + test("to parquet") { + + val filename = "test_images_parquet" + try { + val images = session.readImages(fileLocation, recursive = true) + images.write.mode(SaveMode.Overwrite).parquet(filename) + + val images1 = session.sqlContext.read.parquet(filename) + assert(images1.count() == images.count()) + } finally { + FileUtilities.delTree(new File(filename)) + () + } + } + + test("binary file input") { + + val images = session.readBinaryFiles(fileLocation, recursive = true) + assert(images.count() == 31) + + val tr = new ImageTransformer() + .setInputCol("value") + .setOutputCol("out") + .resize(height = 15, width = 10) + + val preprocessed = tr.transform(images).na.drop + assert(preprocessed.count() == 30) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 15 && row.getInt(1) == 10, "output images have incorrect size") + } + ) + } + + test("crop") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .resize(height = 100, width = 200) + .crop(x = 0, y = 0, height = 22, width = 26) + + val preprocessed = tr.transform(images) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 22 && row.getInt(1) == 26, "output images have incorrect size") + } + ) + } + + test("color format") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .colorFormat(Imgproc.COLOR_BGR2GRAY) + + val preprocessed = tr.transform(images) + + val grayImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(grayImages) + val bytes = Array(10, 1, 3, 9, 6, 16, 11, 7, 8, 6, 26, 40, 57, 50) + // Validate first image first few bytes have been 
transformed correctly + val firstImageBytes = selectTestImageBytes(grayImages) + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("verify blur") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .blur(100, 100) + + val preprocessed = tr.transform(images) + + val blurImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(grayImages) + val bytes = Array(15, 28, 26, 15, 28, 26, 15, 28, 26, 15, 28, 26, 15, 28, 26, 15) + // Validate first image first few bytes have been transformed correctly + val firstImageBytes = selectTestImageBytes(blurImages) + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("verify thresholding") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .threshold(100, 100, Imgproc.THRESH_BINARY) + + val preprocessed = tr.transform(images) + + val thresholdedImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(thresholdedImages) + // Validate first image first few bytes have been transformed correctly + thresholdedImages.foreach( + (row:Row) => { + if (!row.getAs[Array[Byte]](3).forall(b => b == 100 || b == 0)) { + throw new Exception("threshold did not result in binary values") + } + } + ) + } + + test("verify application of gaussian kernel (has blur effect)") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .gaussianKernel(20, 10) + + val preprocessed = tr.transform(images) + + val gaussianImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(gaussianImages) + val firstImageBytes = selectTestImageBytes(gaussianImages) + // Validate first image first few bytes have been transformed correctly + val bytes = Array(8, 14, 14, 4, 8, 7, 4, 5, 5, 4, 5, 6, 5, 9, 8, 3, 8, 7, 7, 13, 12, 8, 12) + // Validate first image first few bytes have been transformed correctly + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("unroll") { + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + + val images = session.readImages(imagePath, recursive = true) + assert(images.count() == 6) + + val unroll = new UnrollImage().setOutputCol("result") + val unrolled = unroll.transform(images).select("image.path","result").collect + + unrolled.foreach(row => { + val path = Paths.get(row.getString(0)) + val expected = firstBytes(path.getFileName().toString()) + val result = row(1).asInstanceOf[DenseVector].toArray + + val length =result.length + if(length != 3072) throw new Exception(s"array length should be 3072, not $length ") + + if(!compareArrays(expected, result)) { + println(path) + println("result: " + result.slice(0,10).deep.toString) + println("expected: " + expected.deep.toString) + throw new Exception("incorrect numeric value for flattened image") + } + }) + } + + private def selectTestImageBytes(images: DataFrame): Array[Byte] = { + images.filter(row => row.getString(4).endsWith("negative/5.jpg")) + .head.getAs[Array[Byte]](3) + } + + private def selectImageCols(images: DataFrame): DataFrame = { + images.select(images("out.height"), + images("out.width"), + images("out.type"), + images("out.bytes"), + images("out.path")) + } + + private def displayImages(images: 
DataFrame): Unit = { + val (jframe, panel) = createScrollingFrame(images.count()) + images.collect().foreach( + (row:Row) => { + val img = new Mat(row.getInt(0), row.getInt(1), row.getInt(2)) + img.put(0,0,row.getAs[Array[Byte]](3)) + // Have to do the MatOfByte dance here + val matOfByte = new MatOfByte() + Imgcodecs.imencode(".jpg", img, matOfByte) + val icon = new ImageIcon(matOfByte.toArray) + val label: JLabel = new JLabel() + label.setIcon(icon) + panel.add(label) + () + } + ) + jframe.pack() + jframe.setVisible(true) + Thread.sleep(10000) + } + + private def createScrollingFrame(count: Long): (JFrame, JPanel) = { + val jframe: JFrame = new JFrame("images") + jframe.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE) + val panel: JPanel = new JPanel() + panel.setLayout(new GridLayout(count.toInt, 1)) + val scrPane: JScrollPane = new JScrollPane(panel) + jframe.getContentPane.add(scrPane) + (jframe, panel) + } + + private val firstBytes = Map( + "00001.png" -> Array(235.0, 231.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0), + "00002.png" -> Array(222.0, 218.0, 194.0, 186.0, 222.0, 236.0, 238.0, 241.0, 243.0, 245.0), + "00000.png" -> Array(49.0, 47.0, 51.0, 53.0, 46.0, 41.0, 47.0, 45.0, 44.0, 41.0), + "00004.png" -> Array(50.0, 64.0, 46.0, 30.0, 22.0, 36.0, 55.0, 57.0, 59.0, 54.0), + "00005.png" -> Array(83.0, 61.0, 26.0, 36.0, 65.0, 67.0, 58.0, 54.0, 63.0, 65.0), + "00003.png" -> Array(149.0, 187.0, 193.0, 205.0, 202.0, 183.0, 181.0, 180.0, 182.0, 189.0) + ) + + private def compareArrays(x: Array[Double], y:Array[Double]): Boolean = { + val length = Math.min(x.length, y.length) + for(i <- 0 to length-1){ + if(Math.abs(x(i) - y(i)) > 1e-5) return false + } + true + } + +} diff --git a/src/multi-column-adapter/build.sbt b/src/multi-column-adapter/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/multi-column-adapter/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala b/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala new file mode 100644 index 0000000000..521a2b212c --- /dev/null +++ b/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala @@ -0,0 +1,121 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.{PipelineStage, Transformer} +import org.apache.spark.ml.param.{Param, ParamMap, TransformerParam} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.types._ + +object MultiColumnAdapter extends DefaultParamsReadable[MultiColumnAdapter] + +/** + * This transformer takes a unary transformer and a list of input output column pairs + * and applies the transformer to each column + */ +class MultiColumnAdapter(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("MultiColumnAdapter")) + + val inputCols: Param[String] = + StringParam( + this, + "inputCols", + "comma separated list of column names encoded as a string") + + /** @group getParam **/ + final def getInputCols: String = $(inputCols) + + /** @group setParam **/ + def setInputCols(value: String): this.type = set(inputCols, value) + + val outputCols: Param[String] = + StringParam( + this, + "outputCols", + "comma separated list of column names encoded as a string") + + /** @group getParam **/ + final def getOutputCols: String = $(outputCols) + + /** @group setParam **/ + def setOutputCols(value: String): this.type = set(outputCols, value) + + def getInputOutputPairs: List[(String, String)] = + getInputCols.split(",").zip(getOutputCols.split(",")).toList + + val baseTransformer: TransformerParam = + new TransformerParam(this, + "baseTransformer", + "base transformer to apply to every column") + + /** @group getParam **/ + final def getBaseTransformer: Transformer = $(baseTransformer) + + /** @group setParam **/ + def setBaseTransformer(value: Transformer): this.type = { + try { + //Test to see whether the class has the appropriate getters and setters + value.getParam("inputCol") + value.getParam("outputCol") + setParamInternal(value, "inputCol", this.uid + "__in") + setParamInternal(value, "outputCol", this.uid + "__out") + } catch { + case e: Exception => + throw new IllegalArgumentException( + "Need to pass a transformer with inputCol and outputCol params") + } + set(baseTransformer, value) + } + + private def setParamInternal[M <: PipelineStage, V](model: M, + name: String, + value: V) = { + model.set(model.getParam(name), value) + } + + private def getParamInternal[M <: PipelineStage](model: M, name: String) = { + model.getOrDefault(model.getParam(name)) + } + + private def setInOutCols[M <: PipelineStage]( + model: M, + inputOutputPair: (String, String)) = { + setParamInternal(setParamInternal(model, "inputCol", inputOutputPair._1), + "outputCol", + inputOutputPair._2) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema) + val firstOutput = setInOutCols(getBaseTransformer, + getInputOutputPairs.head).transform(dataset) + getInputOutputPairs.tail.foldLeft(firstOutput: DataFrame) { (df, pair) => + setInOutCols(getBaseTransformer, pair).transform(df) + } + } + + def copy(extra: ParamMap): this.type = defaultCopy(extra) + + private def verifyCols(df: DataFrame, + inputOutputPairs: List[(String, String)]) = { + inputOutputPairs.foreach { + case (s1, s2) if !df.columns.contains(s1) => + throw new IllegalArgumentException( + s"DataFrame does not contain specified column: $s1") + case (s1, s2) if df.columns.contains(s2) => + throw new IllegalArgumentException( + s"DataFrame already contains specified column: $s2") + case _ => + } + } + + override def transformSchema(schema: 
StructType): StructType = { + getInputOutputPairs.foldLeft(schema) { (schema, pair) => + setInOutCols(getBaseTransformer, pair).transformSchema(schema) + } + } + +} diff --git a/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala b/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala new file mode 100644 index 0000000000..792228ea56 --- /dev/null +++ b/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala @@ -0,0 +1,49 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.Tokenizer +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +class MultiColumnAdapterSpec extends TransformerFuzzingTest { + + val wordDF = session.createDataFrame(Seq( + (0, "This is a test", "this is one too"), + (1, "could be a test", "maybe not"), + (2, "foo", "bar"))) + .toDF("label", "words1", "words2") + val inputCols = "words1,words2" + val outputCols = "output1,output2" + + test("parallelize transformers") { + val transformer1 = new Tokenizer() + val adapter1 = + new MultiColumnAdapter().setBaseTransformer(transformer1).setInputCols(inputCols).setOutputCols(outputCols) + val tokenizedDF = adapter1.transform(wordDF) + val lines = tokenizedDF.getColAs[Array[String]]("output2") + + val trueLines = Array( + Array("this", "is", "one", "too"), + Array("maybe", "not"), + Array("bar") + ) + assert(lines === trueLines) + } + + override def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[MultiColumnAdapter] + .setBaseTransformer(new Tokenizer()) + .setInputCols(inputCols) + .setOutputCols(outputCols) + + override def createDataset: DataFrame = wordDF + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new MultiColumnAdapter() + +} diff --git a/src/partition-sample/build.sbt b/src/partition-sample/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/partition-sample/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/partition-sample/src/main/scala/PartitionSample.scala b/src/partition-sample/src/main/scala/PartitionSample.scala new file mode 100644 index 0000000000..b885f11a1d --- /dev/null +++ b/src/partition-sample/src/main/scala/PartitionSample.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
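A minimal usage sketch for the MultiColumnAdapter above, mirroring the accompanying spec (the DataFrame wordDF and the column names are illustrative):

    import org.apache.spark.ml.feature.Tokenizer

    val adapter = new MultiColumnAdapter()
      .setBaseTransformer(new Tokenizer())
      .setInputCols("words1,words2")       // comma-separated list, one entry per column
      .setOutputCols("tokens1,tokens2")
    val tokenized = adapter.transform(wordDF)   // the Tokenizer is applied to each input/output pair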
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.param._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ + +object PSConstants { + final val ModeRS = "RandomSample" + final val ModeHead = "Head" + final val ModeATP = "AssignToPartition" + + final val rsAbsolute = "Absolute" + final val rsPercent = "Percentage" + + final val newColDefault = "Partition" +} + +trait PartitionSampleParams extends MMLParams { + + /* Mode: {RandomSample|AssignToPartition|Head} + - RS: {Absolute|Percentage, Seed} + - Absolute: {Count} + - Percentage: {Percent} + - ATP: {Seed, numParts, newColName} + - Head: {Count} + */ + // TODO: Convert to Enum + final val mode = StringParam(this, "mode", "AssignToPartition, RandomSample, or Head") + setDefault(mode, PSConstants.ModeRS) + final def getMode: String = $(mode) + def setMode(value: String): this.type = set(mode, value) + + // TODO: Convert to Enum + // Relevant on Mode = RS + final val rsMode = StringParam(this, "rsMode", "Absolute or Percentage", PSConstants.rsPercent) + final def getRandomSampleMode: String = $(rsMode) + def setRandomSampleMode(value: String): this.type = set(rsMode, value) + + // Relevant on Mode = RS|ATP + // TODO: We need to create Option[Int] idiom for params + final val seed = LongParam(this, "seed", "seed for random ops", -1L) + final def getSeed: Long = $(seed) + def setSeed(value: Long): this.type = set(seed, value) + + // Relevant on RSMode = Percentage + final val percent = DoubleParam(this, "percent", "percent of rows to return", 0.01) + final def getPercent: Double = $(percent) + def setPercent(value: Double): this.type = set(percent, value) + + // Relevant on Mode = Head | RSMode = Absolute + final val count = LongParam(this, "count", "number of rows to return", 1000L) + final def getCount: Long = $(count) + def setCount(value: Long): this.type = set(count, value) + + // Relevant on Mode = ATP + final val newColName = StringParam(this, "newColName", "name of the partition column", PSConstants.newColDefault) + final def getNewColName: String = $(newColName) + def setNewColName(value: String): this.type = set(newColName, value) + + // Relevant on Mode = ATP + final val numParts = IntParam(this, "numParts", "number of partitions", 10) + final def getNumParts: Int = $(numParts) + def setNumParts(value: Int): this.type = set(numParts, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + if (Seq(PSConstants.ModeHead, PSConstants.ModeRS).contains($(mode))) + schema + else + ??? // schema + newCol + } +} + +object PartitionSample extends DefaultParamsReadable[PartitionSample] + +// UID should be overridden by driver for controlled identification at the DAG level +sealed class PartitionSample(override val uid: String) + extends Transformer + with PartitionSampleParams { + + def this() = this(Identifiable.randomUID("PartitionSample")) + + override def transform(dataset: Dataset[_]): DataFrame = { + $(mode) match { + case PSConstants.ModeHead => dataset.limit( + if ($(count) <= 2000000000) $(count).toInt else throw new Exception("Head limit 2b rows")).toDF + case PSConstants.ModeRS => randomSample(dataset, $(rsMode), $(seed)).toDF + case PSConstants.ModeATP => dataset.withColumn($(newColName), /* broken */ dataset.col("input")) + case _ => ??? 
+ } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): PartitionSample = defaultCopy(extra) + + private def randomSample( + ds: Dataset[_], + rsMode: String, + seed: Long, + replace: Boolean = false): Dataset[_] = { + val frac = rsMode match { + case PSConstants.rsPercent => $(percent) + case PSConstants.rsAbsolute => $(count).toDouble / ds.count + case _ => ??? + } + println(s"Sampling ${ds.count} rows by ${frac * 100}% to get ~${ds.count * frac} rows") + return ds.sample(replace, frac, seed) + } + +} diff --git a/src/partition-sample/src/test/scala/VerifyPartitionSample.scala b/src/partition-sample/src/test/scala/VerifyPartitionSample.scala new file mode 100644 index 0000000000..1ed325f9c9 --- /dev/null +++ b/src/partition-sample/src/test/scala/VerifyPartitionSample.scala @@ -0,0 +1,67 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ + +class PartitionSampleSmokeTests extends TestBase { + + import session.implicits._ + + test("head 3") { + val sampler = new PartitionSample().setMode("Head").setCount(3) + val out = sampler.transform(makeDF) + assert(out.count === 3) + assert(makeDF.head === out.head) + } + + test("random sample smoke") { + val df = extendedDF(10) + val sampler = new PartitionSample() + .setMode("RandomSample") + .setRandomSampleMode("Absolute") + .setSeed(1) + .setCount(10) + val out = sampler.transform(df) + assert(out.count < 16) + assert(out.count > 5) + + val sampler2 = new PartitionSample() + .setMode("RandomSample") + .setRandomSampleMode("Percentage") + .setSeed(1) + .setPercent(0.5) + val out2 = sampler2.transform(df) + assert(out2.count < 100) + assert(out2.count > 60) + } + + def extendedDF(n: Int = 10): DataFrame = { + (2 to n).map(_ => makeDF).foldLeft(makeDF)((a, b) => a.union(b)) + } + + lazy val makeDF: DataFrame = { + Seq(( 1, 2), + ( 3, 4), + ( 5, 6), + ( 7, 8), + ( 9, 10), + (11, 12), + (13, 14), + (15, 16), + (17, 18), + (19, 20), + (21, 22), + (23, 24), + (25, 26), + (27, 28), + (29, 30), + (31, 32)) + .toDF("Col1", "Col2") + } + +} diff --git a/src/pipeline-stages/build.sbt b/src/pipeline-stages/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/pipeline-stages/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/pipeline-stages/src/main/scala/Repartition.scala b/src/pipeline-stages/src/main/scala/Repartition.scala new file mode 100644 index 0000000000..ad277fe1c7 --- /dev/null +++ b/src/pipeline-stages/src/main/scala/Repartition.scala @@ -0,0 +1,42 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
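A minimal usage sketch for PartitionSample, following the smoke tests above (the DataFrame df is assumed; sampled row counts are approximate and depend on the seed):

    val head3 = new PartitionSample().setMode("Head").setCount(3).transform(df)

    val sampled = new PartitionSample()
      .setMode("RandomSample")
      .setRandomSampleMode("Percentage")
      .setSeed(1)
      .setPercent(0.5)
      .transform(df)                       // roughly half of the rows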
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ + +object Repartition extends DefaultParamsReadable[Repartition] + +class Repartition(val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("Repartition")) + + val n: IntParam = IntParam(this, "n", "Number of partitions", + validation = ParamValidators.gt[Int](0)) + + final def getN: Int = $(n) + + def setN(value: Int): this.type = set(n,value) + + override def transform(dataset: Dataset[_]): DataFrame = { + + if (getN < dataset.rdd.getNumPartitions){ + dataset.coalesce(getN).toDF() + }else{ + dataset.sqlContext.createDataFrame( + dataset.rdd.repartition(getN).asInstanceOf[RDD[Row]], + dataset.schema) + } + } + + def transformSchema(schema: StructType): StructType = { + schema + } + + def copy(extra: ParamMap): this.type = defaultCopy(extra) + +} diff --git a/src/pipeline-stages/src/main/scala/SelectColumns.scala b/src/pipeline-stages/src/main/scala/SelectColumns.scala new file mode 100644 index 0000000000..e29b1f6329 --- /dev/null +++ b/src/pipeline-stages/src/main/scala/SelectColumns.scala @@ -0,0 +1,63 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +object SelectColumns extends DefaultParamsReadable[SelectColumns] + +/** + * This class takes a dataframe and a list of columns to select and returns + * a dataframe comprised of only those columns listed in the input list. + * + * The columns to be selected is a comma separated list of column names, contained in a single string. 
+ */ + +class SelectColumns(val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("SelectColumns")) + + val cols: StringArrayParam = new StringArrayParam(this, "cols", "comma separated list of selected column names") + + /** @group getParam **/ + final def getCols: Array[String] = $(cols) + + /** @group setParam **/ + def setCols(value: Array[String]): this.type = set(cols, value) + + def setCol(value: String): this.type = set(cols, Array(value)) + + /** + * @param dataset - The input dataset, to be transformed + * @return The DataFrame that results from column selection + */ + override def transform(dataset: Dataset[_]): DataFrame = { + verifySchema(dataset.schema) + dataset.toDF().select(getCols.map(col): _*) + } + + def transformSchema(schema: StructType): StructType = { + verifySchema(schema) + val selectedCols = getCols.toSet + StructType(schema.fields.filter(f => selectedCols(f.name))) + } + + def copy(extra: ParamMap): SelectColumns = defaultCopy(extra) + + private def verifySchema(schema: StructType): Unit = { + val providedCols = schema.fields.map(_.name).toSet + val invalidCols = getCols.filter(!providedCols(_)) + + if (invalidCols.length > 0) { + throw new NoSuchElementException( + s"DataFrame does not contain specified columns: ${invalidCols.reduce(_ + "," + _)}") + } + + } + +} diff --git a/src/pipeline-stages/src/test/scala/RepartitionSuite.scala b/src/pipeline-stages/src/test/scala/RepartitionSuite.scala new file mode 100644 index 0000000000..60d3c4f1b2 --- /dev/null +++ b/src/pipeline-stages/src/test/scala/RepartitionSuite.scala @@ -0,0 +1,50 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.param.{ParamMap, ParamPair} + +class RepartitionSuite extends TestBase { + + import session.implicits._ + + val input = Seq( + (0, "guitars", "drums"), + (1, "piano", "trumpet"), + (2, "bass", "cymbals"), + (3, "guitars", "drums"), + (4, "piano", "trumpet"), + (5, "bass", "cymbals"), + (6, "guitars", "drums"), + (7, "piano", "trumpet"), + (8, "bass", "cymbals"), + (9, "guitars", "drums"), + (10, "piano", "trumpet"), + (11, "bass", "cymbals") + ).toDF("numbers", "words", "more") + + test("Work for several values of n") { + + def test(n: Int): Unit = { + val result = new Repartition() + .setN(n) + .transform(input) + assert(result.rdd.getNumPartitions == n) + () + } + List(1, 2, 3, 10).foreach(test) + + } + + test("Should allow a user to set the partitions" + + " specifically in pipeline transform"){ + val r = new Repartition().setN(1) + val pipe = new Pipeline().setStages(Array(r)) + val fitPipe = pipe.fit(input) + assert(fitPipe.transform(input).rdd.getNumPartitions==1) + assert(fitPipe.transform(input, ParamMap(r.n->5)).rdd.getNumPartitions ==5) + } + +} diff --git a/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala b/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala new file mode 100644 index 0000000000..830e48d320 --- /dev/null +++ b/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala @@ -0,0 +1,75 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
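Minimal sketches of the two pipeline stages above (the DataFrame df and its column names are illustrative):

    val twoParts = new Repartition().setN(2).transform(df)        // exactly 2 partitions

    val selected = new SelectColumns()
      .setCols(Array("words", "more"))                            // keep only these columns
      .transform(df)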
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +class SelectColumnsSuite extends TransformerFuzzingTest { + + import session.implicits._ + + test("Select all columns in a data frame") { + val input = makeBasicDF() + val result = new SelectColumns() + .setCols(Array("numbers", "words", "more")) + .transform(input) + assert(verifyResult(input, result)) + } + + test("Test: Select two columns in a data frame") { + val expected = Seq( + ("guitars", "drums"), + ("piano", "trumpet"), + ("bass", "cymbals") + ).toDF("words", "more") + val result = new SelectColumns() + .setCols(Array("words", "more")) + .transform(makeBasicDF()) + assert(verifyResult(expected, result)) + } + + test("Test: Select columns with spaces") { + val expected = Seq( + ("guitars", "drums"), + ("piano", "trumpet"), + ("bass", "cymbals") + ).toDF("words", "Scored Labels") + val result = new SelectColumns() + .setCols(Array("words", "Scored Labels")) + .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels")) + assert(verifyResult(expected, result)) + } + + test("Test: Select one column from the data frame") { + val expected = Seq( + "guitars", + "piano", + "bass" + ).toDF("words") + val result = new SelectColumns() + .setCols(Array("words")) + .transform(makeBasicDF()) + assert(verifyResult(expected, result)) + } + + test("Invalid column specified") { + try { + new SelectColumns().setCol("four").transform(makeBasicDF()) + fail() + } catch { + case _: NoSuchElementException => + } + } + + override def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[SelectColumns].setCols(fitDataset.columns) + + override def createDataset: DataFrame = makeBasicDF() + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new SelectColumns() +} diff --git a/src/project/build.sbt b/src/project/build.sbt new file mode 100644 index 0000000000..898a255da9 --- /dev/null +++ b/src/project/build.sbt @@ -0,0 +1,16 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +// Same options as in build.scala +scalacOptions ++= Seq( + "-encoding", "UTF-8", + // Explain warnings, optimize + "-deprecation", "-unchecked", "-feature", "-optimise", + "-Xfatal-warnings", "-Xlint", // all warnings + // -Y* are Scala options + "-Yno-adapted-args", // "-Ywarn-adapted-args", + "-Ywarn-dead-code", + "-Ywarn-numeric-widen", + "-Ywarn-value-discard" + // this leads to problems sometimes: "-Yinline-warnings" +) diff --git a/src/project/build.scala b/src/project/build.scala new file mode 100644 index 0000000000..a368c0bde9 --- /dev/null +++ b/src/project/build.scala @@ -0,0 +1,201 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sbt._ +import Keys._ + +import sys.process.Process + +import sbtassembly._ +import sbtassembly.AssemblyKeys._ +import sbtassembly.AssemblyPlugin.autoImport.assembly + +object Extras { + + private def env(varName: String, default: String = "") = + sys.env.getOrElse(varName, + if (default != null) default + else sys.error(s"Missing $$$varName environment variable")) + + // get the version from $MML_VERSION, or from `show-version` + def mmlVer = sys.env.getOrElse("MML_VERSION", + Process("../tools/runme/show-version").!!.trim) + + def defaultOrg = "com.microsoft.ml.spark" + def scalaVer = env("SCALA_FULL_VERSION", null) + def sparkVer = env("SPARK_VERSION", null) + + def commonResolvers = Seq( + "MMLSpark Maven Repo" at "https://mmlspark.azureedge.net/maven" + ) + def commonLibs = Seq( + "org.apache.spark" %% "spark-core" % sparkVer % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVer % "provided", + "org.scalatest" %% "scalatest" % "3.0.0" % "provided", + // should include these things in the distributed jar + "io.spray" %% "spray-json" % "1.3.2", + "com.microsoft.CNTK" % "cntk_jni" % "2.0rc3", + "org.opencv" % "opencv_jni" % "3.2.0" + ) + def overrideLibs = Set( + // spark wants 2.2.6, but we don't use its tests anyway + "org.scalatest" %% "scalatest" % "3.0.0" % "provided" + ) + + def artifactsDir = file(env("BUILD_ARTIFACTS", "../BuildArtifacts/packages/m2")) + def testsDir = file(env("TEST_RESULTS", "../TestResults")) + def scalacOpts = Seq( + "-encoding", "UTF-8", + // Explain warnings, optimize + "-deprecation", "-unchecked", "-feature", "-optimise", + "-Xfatal-warnings", "-Xlint", // all warnings + // -Y* are Scala options + "-Yno-adapted-args", // "-Ywarn-adapted-args", + "-Ywarn-dead-code", + "-Ywarn-numeric-widen", + "-Ywarn-value-discard" + // this leads to problems sometimes: "-Yinline-warnings" + ) + + // Some convenience commands + val sectionPrefix = + if (env("BUILDMODE") == "server") "##[section]SBT: " else "===>>> SBT: " + def addCommands(st: State, cmds: String*): State = + st.copy(remainingCommands = cmds ++ st.remainingCommands) + def newCommands = Seq( + Command.single("cd") { (st, arg) => + addCommands(st, s"project $arg") }, + Command.args("echo", "") { (st, args) => + println(args.map(s => if (s == "
") "\n" else s).mkString(" ")); st }, + Command.single("show-section") { (st, arg) => + println("\n" + sectionPrefix + arg); st }, + Command.single("noisy-command") { (st, cmd) => + addCommands(st, s"show-section $cmd", cmd) }, + Command.single("on-all-subs") { (st, cmd) => + addCommands(st, SubProjects.all.map("noisy-command " + _ + "/" + cmd): _*) }, + Command.command("full-build") { st => + val steps = Seq(if (env("PUBLISH") == "all") "update" else null, + "run-scalastyle", + "compile", + if (testSpec == "none") null else "test:compile", + "package", + if (testSpec == "none") null else "on-all-subs test", + "codegen/run", + "publish") + addCommands(st, steps.filter(_ != null).map("noisy-command " + _): _*) } + ) ++ ScalaStyleExtras.commands + + // Utilities for sub-project sbt files + def noJar = Seq(Keys.`package` := file("")) + + // Translate $TESTS to command-line arguments + val testSpec = env("TESTS", "-extended") + def testOpts = + // Generate JUnit-style test result files + Seq(testOptions in (ThisBuild, Test) += + Tests.Argument("-u", testsDir.toString())) ++ + (if (testSpec == "all" || testSpec == "none") Seq() + else testSpec.split(",").map { spec => + testOptions in (ThisBuild, Test) += + Tests.Argument(if (spec.substring(0,1) == "+") "-n" else "-l", + "com.microsoft.ml.spark.test.tags." + + spec.substring(1)) }) + + def defaultSettings = Seq( + // Common stuff: defaults for all subprojects + scalaVersion in ThisBuild := scalaVer, + organization in ThisBuild := defaultOrg, + resolvers in ThisBuild ++= commonResolvers, + libraryDependencies in ThisBuild ++= commonLibs, + dependencyOverrides in ThisBuild ++= overrideLibs, + scalacOptions in ThisBuild ++= scalacOpts, + // Don't run tests in parallel, and fork subprocesses for them + parallelExecution in (ThisBuild, Test) := false, + fork in (ThisBuild, Test) := true, + // Assembly options + aggregate in assembly := false, + aggregate in publish := false, + test in assembly := {}, + // Documentation settings + autoAPIMappings in ThisBuild := true, + // Ctrl+C kills a running job, not sbt + cancelable in ThisBuild := true, + // No verbose logs during update + logLevel in (ThisBuild, update) := Level.Warn, + // Fewer errors to display (the default is 100) + maxErrors in ThisBuild := 20, + // Show stack traces up to the first SBT stack frame + traceLevel in ThisBuild := 0, + // Stamp the jar manifests with the build info + packageOptions in (Compile, packageBin) += + Package.ManifestAttributes( + "MMLBuildInfo" -> env("MML_BUILD_INFO", "(direct sbt build, no info collected)")), + // For convenience, import the main package in a scala console + initialCommands in (ThisBuild, console) := "import com.microsoft.ml.spark._", + // Use the above commands + commands in ThisBuild ++= newCommands + ) ++ testOpts + + def rootSettings = + defaultSettings ++ + noJar ++ // no toplevel jar + // With this we get: + // mmlspark_2.11-$ver-assembly.jar{,.md5,.sha1} + // mmlspark_2.11-$ver.pom{,.md5,.sha1} + // mmlspark_2.11-$ver{,-javadoc,-sources}.jar{,.md5,.sha1} + // the first are the combined jar, and the second are the needed pom files. + // The third all look empty and discardable. Without this, we get the same + // structure, except it seems that it tries to write both the empty jar and + // the combined jar onto the same mmlspark_2.11-$ver.jar{,.md5,.sha1} files, + // spitting up a warning, and sometimes the result is the combined jar and + // sometimes it's the empty (probably the above empty jar with the same no + // "-assembly" name). 
Later in the build we discard the junk files, and + // leave only the combined one. + Seq(artifact in (Compile, assembly) := + (artifact in (Compile, assembly)).value.copy(`classifier` = Some("assembly"))) ++ + addArtifact(artifact in (Compile, assembly), assembly) ++ + Seq( + // This creates a maven structure, which we upload to azure storage later + publishTo := Some(Resolver.file("file", artifactsDir)), + // In case we need to add more stuff to the uber-jar, use this: + // unmanagedResourceDirectories in Compile += artifactsDir / "more", + publishArtifact in Test := false, + publishMavenStyle := true, + // Remove the "scala-library" dependency + autoScalaLibrary := false, + // Don't include things we depend on (we leave the dependency in the POM) + assemblyOption in assembly := + (assemblyOption in assembly).value.copy( + includeScala = false, includeDependency = false), + pomPostProcess := { n: scala.xml.Node => + import scala.xml._, scala.xml.transform._ + new RuleTransformer(new RewriteRule { + override def transform(n: Node) = + // Filter out things that shouldn't be a dependency: things that + // have "provided", or "true". + // The latter is generated in meta.sbt for toplevel dependencies. + if (n.label == "dependency" && + (n.child.contains(true) || + n.child.contains(provided))) + Seq.empty + else if (n.label == "repositories") + // Deduplicate repo entries, since we get one for each subproject + { n.child.distinct } + else + Seq(n) + }).transform(Seq(pomPostProcess.value.apply(n))).head + }, + // Show the current project in the prompt + shellPrompt in ThisBuild := (st => { + val ex = Project.extract(st) + val proj = ex.currentRef.project + val root = ex.rootProject(ex.currentRef.build) + s"${if (proj == root) "" else root+"/"}${proj}> " + }), + // Use the same history path for everything instead of per project files + historyPath in ThisBuild := Some((target in LocalRootProject).value / ".history") + ) + + LibraryCheck() // invoke the library checker + +} diff --git a/src/project/lib-check.scala b/src/project/lib-check.scala new file mode 100644 index 0000000000..168542fad5 --- /dev/null +++ b/src/project/lib-check.scala @@ -0,0 +1,34 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +import sbt._ +import Keys._ +import scala.io._ + +// Warn user when the library configuration has changed +object LibraryCheck { + def apply() = () + val info = file(sys.env.getOrElse("HOME", System.getProperty("user.home"))) / + ".mmlspark_installed_libs" + val conf = file("..") / "tools" / "config.sh" + val (len, modif) = (conf.length, conf.lastModified) + def read[T](file: File, read: BufferedSource => T): T = { + val i = Source.fromFile(file); try read(i) finally i.close + } + lazy val text = + "(?s)INSTALLATIONS=\\(.*?\r?\n\\)\r?\n".r.findFirstIn(read(conf, _.mkString)).get + lazy val (len_, modif_, text_) = + read(info, i => { + val meta = i.getLines.take(2).toList.map(_.toLong) + (meta(0), meta(1), i.mkString) }) + def writeInfo() = scala.tools.nsc.io.File(info).writeAll(s"$len\n$modif\n$text") + if (!info.exists) writeInfo() + else if (len_ != len || modif_ != modif) { + if (text_ == text) writeInfo() + else { + println("\n!!! 
Warning: Library configuration changed," + + " consider using ./runme to update !!!\n") + Thread.sleep(1000) + } + } +} diff --git a/src/project/meta.sbt b/src/project/meta.sbt new file mode 100644 index 0000000000..6eefb69158 --- /dev/null +++ b/src/project/meta.sbt @@ -0,0 +1,108 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +// Auto-generate sub-project definitions + +val _ = { + + val topDir = file(".") + val topName = "MMLSpark" + val ignoredDirs = Array(topDir / "project") + + def userError(message: String): Nothing = { + System.err.println(message) + System.err.println("Aborting...") + exit(1) + } + + case class Proj(val dir: File, val children: List[Proj]) { + val name = if (dir == topDir) topName + else Project.normalizeModuleID(dir.toString.substring(2)) + val dirs = Iterator.iterate(dir)(_.getParentFile).takeWhile(_ != null) + .map(_.getName).toList.reverse.drop(1) + val props = { + val i = scala.io.Source.fromFile(dir / "build.sbt") + val rx = "//> *(\\w+) *: *(.*?) *".r + (try i.getLines.toList finally i.close) + .map(_ match { case rx(x,y) => (x,y); case _ => null; }).filter(_ != null) + .foldLeft(Map[String,List[String]]()) { (m,kv) => + m + (kv._1 -> (m.getOrElse(kv._1,Nil) :+ kv._2)) } + } + lazy val deps = children ++ props.getOrElse("DependsOn", Nil).map(nameToProj) + def flatList(): List[Proj] = this +: children.flatMap(_.flatList) + override def toString() = name + } + + def findProjects(dir: File): List[Proj] = { + val (dirs, files) = dir.listFiles().sorted.toList.partition(_.isDirectory) + val nested = dirs.flatMap(findProjects) + if (ignoredDirs.contains(dir) || !files.exists(p => p.getName == "build.sbt")) nested + else List(Proj(dir, nested)) + } + + def nameToProj(name: String): Proj = + nameProjMap.getOrElse(Project.normalizeModuleID(name), + userError(s"Bad project name: $name...")) + + // Cheap topological sort for projects; note that the input is sorted + // alphabetically, and it preserves this order when posible + def depSort(projs: List[List[Proj]]): List[Proj] = { + if (projs.isEmpty) Nil + else projs.find(_.tail.isEmpty) match { + case Some(x +: _) => x +: depSort(projs.map(_.filterNot(_==x)).filterNot(_.isEmpty)) + case _ => userError(s"Dependency cycle! {${projs.map(_.head).mkString(", ")}}") + } + } + + lazy val topProj = findProjects(topDir)(0) + lazy val nameProjMap = topProj.flatList.map(p => (p.name -> p)).toMap + lazy val sortedProjs = depSort(topProj.flatList.map(p => p +: p.deps)) + + def projToSbt(proj: Proj): String = { + def showList(list: List[Proj], what: String, sfx: String) = { + if (list.isEmpty) "" + else s"""\n .${what}(\n ${list.map(p => s"`$p`$sfx").mkString(",\n ")})""" + } + (s"""val `$proj` = (project in ${("topDir" +: proj.dirs.map("\""+_+"\"")) + .mkString(" / ")})""" + + "\n .settings(Extras.defaultSettings: _*)" + + showList(proj.children, "aggregate", "") + // for the root project, use "optional" -- I don't know what it should + // do, but it's visible in the POM file, which allows us to filter our + // dependencies out of it (in "build.scala"). 
+ + showList(proj.deps, "dependsOn", + if (proj != topProj) " % \"compile->compile;test->test\"" + else " % \"compile->compile;optional\""))} + + IO.write(topDir / "autogen.sbt", + s"""// Automatically generated, DO NOT EDIT\n + |val topDir = file(".")\n + |${sortedProjs.map(projToSbt).mkString("\n\n")} + |""".stripMargin) + + IO.write(topDir / "project" / "autogen.scala", + s"""// Automatically generated, DO NOT EDIT\n + |import sbt._\nimport Keys._\n + |object SubProjects { + | val all = Seq( + | ${sortedProjs.filter(_.children.isEmpty) + .map("\"" + _.name + "\"").mkString(",\n ")}) + |} + |""".stripMargin) + + IO.write(topDir / "project" / "project-roots.txt", + sortedProjs + .map(p => { + val d = p.dir + (if (d == topDir) d else d.toString.substring(2)) + "\n"}) + .mkString("")) + + IO.write(topDir / "project" / "dependencies.digraph", + s"""// Automatically generated, DO NOT EDIT\n + |digraph ${topName} { + | ${sortedProjs.flatMap(p => p.deps.map(d => s""""$p" -> "$d";""")) + .mkString("\n ")} + |} + |""".stripMargin) + +} diff --git a/src/project/plugins.sbt b/src/project/plugins.sbt new file mode 100644 index 0000000000..7400f588e0 --- /dev/null +++ b/src/project/plugins.sbt @@ -0,0 +1,5 @@ +logLevel := Level.Warn + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.0") diff --git a/src/project/scalastyle.scala b/src/project/scalastyle.scala new file mode 100644 index 0000000000..6902ce397a --- /dev/null +++ b/src/project/scalastyle.scala @@ -0,0 +1,136 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +import sbt._ +import Keys._ + +final object ScalaStyleExtras { + + // First string determines options: + // * [st]?: optional src- or test-only rule (default: both) + // * [FS]: whether this is a .scalariform or a .file rule + // * [EWI]: Error / Warn / Ignore + val rules = List( + r("FE", "FileLengthChecker", ("maxFileLength", 800)), + r("FE", "FileTabChecker"), + r("FE", "FileLineLengthChecker", ("maxLineLength", 120)), + r("FE", "NewLineAtEofChecker"), + r("FE", "RegexChecker", ("regex", "\n\n\n")), + r("SW", "TokenChecker", ("regex", ".{33}")), + r("FE", "HeaderMatchesChecker", + ("header", + Seq("^// Copyright \\(C\\) Microsoft Corporation\\. All rights reserved\\.", + "// Licensed under the MIT License\\. See LICENSE in project root for information\\.", + "", + "package (?:com\\.microsoft\\.ml\\.spark|org\\.apache\\.spark)[.\n]") + .mkString("\n")), + ("regex", true)), + r("FI", "IndentationChecker", ("tabSize", 2), ("methodParamIndentSize", 2)), + r("FE", "WhitespaceEndOfLineChecker"), + r("SE", "SpacesAfterPlusChecker"), + r("SE", "SpacesBeforePlusChecker"), + r("SE", "NoWhitespaceBeforeLeftBracketChecker"), + r("SE", "NoWhitespaceAfterLeftBracketChecker"), + r("SE", "EmptyClassChecker"), + r("SW", "EnsureSingleSpaceAfterTokenChecker", ("tokens", "COLON")), // IF, FOR, WHILE, ELSE -- one or more + r("SE", "EnsureSingleSpaceBeforeTokenChecker"), // what is this doing? 
+ r("SW", "DisallowSpaceAfterTokenChecker"), + r("SE", "DisallowSpaceBeforeTokenChecker"), + r("SE", "ClassNamesChecker", ("regex", "^[A-Z][A-Za-z0-9]*$")), + r("SE", "ObjectNamesChecker", ("regex", "^[A-Za-z][A-Za-z0-9]*$")), // allow function-like names + r("SE", "PackageObjectNamesChecker", ("regex", "^[a-z][A-Za-z]*$")), + // this matches the first token after a `val`, which might be `(` in case of val (x, y) = ... + r("SW", "FieldNamesChecker", ("regex", "^([a-z][A-Za-z0-9]*| *\\( *)$")), + r("SW", "MethodNamesChecker", ("regex", "^[a-z][A-Za-z0-9]*(_=)?$")), + r("SE", "ClassTypeParameterChecker", ("regex", "^[A-Z_]$")), + r("SE", "EqualsHashCodeChecker"), + r("SE", "IllegalImportsChecker", ("illegalImports", "sun._")), + r("SE", "DeprecatedJavaChecker"), + r("SE", "ParameterNumberChecker", ("maxParameters", 9)), + r("SW", "MethodLengthChecker", ("maxLength", 50)), + r("SE", "NumberOfTypesChecker", ("maxTypes", 30)), + r("SE", "NumberOfMethodsInTypeChecker", ("maxMethods", 30)), + r("SE", "NumberOfTypesChecker"), + r("SW", "CyclomaticComplexityChecker", ("maximum", 10)), + r("SE", "PublicMethodsHaveTypeChecker"), + r("sSW", "MagicNumberChecker", ("ignore", "-1,0,1,2,3")), + r("SE", "UppercaseLChecker"), + r("SE", "ProcedureDeclarationChecker"), + r("SE", "RedundantIfChecker"), + r("SW", "WhileChecker"), + r("SW", "ReturnChecker"), + r("SW", "NullChecker"), + r("SE", "NoCloneChecker"), + r("SE", "NoFinalizeChecker"), + r("SE", "StructuralTypeChecker"), + r("SE", "CovariantEqualsChecker"), + r("SE", "NonASCIICharacterChecker"), + // looks like this doesn't work + r("SE", "ImportOrderChecker", ("groups" , "our,scala,java,other"), + ("our", "com.microsoft.ml.spark[.].+"), ("scala", "scala[.].+"), ("java", "java[.].+"), ("other", ".+")), + r("SE", "SimplifyBooleanExpressionChecker"), + r("SW", "NotImplementedErrorUsage") + // r("SE", "ScalaDocChecker") <-- use when we add scaladoc + // Rules that are not used: + // VarLocalChecker: needed in some places + // VarFieldChecker: -"- + // BlockImportChecker: we want to be able to name specific imports... + // ImportGroupingChecker: ... and be able to import in the middle of code + // UnderscoreImportChecker: ... and use _ wildcards + // NoNewLineAtEofChecker: want a newline there + // ForBraceChecker: "for {...} yield ..." looks fine, but "for { ... } { ... 
}" looks messy + // XmlLiteralChecker: maybe it'll be useful + // LowercasePatternMatchChecker: Lots of places where it's fine + // MultipleStringLiteralsChecker: applies even in interpolation parts + // PatternMatchAlignChecker: Looks like it's wrong anyway + // SpaceAfterCommentStartChecker: rejects the popular "//TODO:" + // TodoCommentChecker: at least for now we need them + ) + + val conf = file(".") / "scalastyle-config.xml" + + def modes = Map(' ' -> null, 's' -> "src", 't' -> "test") + def prefixes = Map('F' -> "file", 'S' -> "scalariform") + def levels = Map('E' -> "error", 'W' -> "warning", 'I' -> null) + def r(flags: String, name: String, params: (String, Any)*) = { + val f3 = if (flags.length < 3) " "+flags else flags + (modes(f3(0)), s"org.scalastyle.${prefixes(f3(1))}.${name}", levels(f3(2)), params) + } + + def mkRule(curmode: String)(rule: (String, String, String, Seq[(String,Any)])): String = { + val (mode, name, level, params) = rule + if (level == null || (mode != null && curmode != mode)) return null + val paramStr = + if (params.isEmpty) "" + else ("" + + params.map(p => "\n ") + .mkString("") + + "") + s"""$paramStr""" + } + def mkConfig(mode: String): String = + s""" + | Scalastyle Module Configuration ($mode) + | ${rules.map(mkRule(mode)).filter(_ != null).mkString("\n ")} + | + |""".stripMargin + + def commands = Seq( + Command.command("run-scalastyle") { st => + Extras.addCommands(st, "run-scalastyle-on src", "run-scalastyle-on test") + }, + Command.single("scalastyle-make-config") { (st, mode) => + scala.tools.nsc.io.File(conf).writeAll(mkConfig(mode)) + st + }, + Command.single("run-scalastyle-on") { (st, mode) => + val cmd = (if (mode == "src") "" else mode + ":") + "scalastyle" + Extras.addCommands(st, s"scalastyle-make-config $mode", + s"noisy-command on-all-subs $cmd", + "scalastyle-delete-config") + }, + Command.command("scalastyle-delete-config") { st => conf.delete; st } + ) + +} diff --git a/src/readers/build.sbt b/src/readers/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/readers/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/readers/src/main/python/BinaryFileReader.py b/src/readers/src/main/python/BinaryFileReader.py new file mode 100644 index 0000000000..7cbac9205c --- /dev/null +++ b/src/readers/src/main/python/BinaryFileReader.py @@ -0,0 +1,52 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark import SparkContext +from pyspark import sql +from pyspark.ml.param.shared import * +from pyspark.sql import DataFrame +from pyspark.sql.types import * + +BinaryFileFields = ["path", "bytes"] + +BinaryFileSchema = StructType([ + StructField(BinaryFileFields[0], StringType(), True), + StructField(BinaryFileFields[1], BinaryType(), True) ]) + +def readBinaryFiles(self, path, recursive = False, sampleRatio = 1.0, inspectZip = True): + """ + Reads the directory of binary files from the local or remote (WASB) source + + :param str path: Path to the file directory + :param bool recursive: Recursive search flag + :param double sampleRatio: Fraction of the files loaded into the dataframe + :return: DataFrame with a single column "value"; see binaryFileSchema for details + :rtype: DataFrame + """ + ctx = SparkContext.getOrCreate() + reader = ctx._jvm.com.microsoft.ml.spark.BinaryFileReader + sql_ctx = pyspark.SQLContext.getOrCreate(ctx) + jsession = sql_ctx.sparkSession._jsparkSession + jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip) + return DataFrame(jresult, sql_ctx) + +setattr(sql.SparkSession, 'readBinaryFiles', classmethod(readBinaryFiles)) + +def isBinaryFile(df, column): + """ + Returns True if the column contains binary files + + :param DataFrame df: The DataFrame to be processed + :param bool column: The name of the column being inspected + :return: True if the colum is a binary files column + :rtype: bool + """ + ctx = SparkContext.getOrCreate() + schema = ctx._jvm.com.microsoft.ml.spark.schema.BinaryFileSchema + return schema.isBinaryFile(df._jdf, column) diff --git a/src/readers/src/main/python/ImageReader.py b/src/readers/src/main/python/ImageReader.py new file mode 100644 index 0000000000..176326b1f6 --- /dev/null +++ b/src/readers/src/main/python/ImageReader.py @@ -0,0 +1,50 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark import SparkContext +from pyspark import sql +from pyspark.ml.param.shared import * +from pyspark.sql import DataFrame + +def readImages(sparkSession, path, recursive = False, sampleRatio = 1.0, inspectZip = True): + """ + Reads the directory of images from the local or remote (WASB) source. + This function is attached to SparkSession class. + Example: spark.readImages(path, recursive, ...) 
+ + :param SparkSession sparkSession: Existing sparkSession + :param str path: Path to the image directory + :param bool recursive: Recursive search flag + :param double sampleRatio: Fraction of the images loaded into dataframe + :param bool inspectZip: Look for images inside zip files + :return: DataFrame with a single column of "images", see imageSchema for details + :rtype: DataFrame + """ + ctx = SparkContext.getOrCreate() + reader = ctx._jvm.com.microsoft.ml.spark.ImageReader + sql_ctx = pyspark.SQLContext.getOrCreate(ctx) + jsession = sql_ctx.sparkSession._jsparkSession + jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip) + return DataFrame(jresult, sql_ctx) + +setattr(sql.SparkSession, 'readImages', classmethod(readImages)) + +def isImage(df, column): + """ + Returns True if the column contains images + + :param DataFrame df: The DataFrame to be processed + :param bool column: The name of the column being inspected + :return: True if the colum is an image column + :rtype: bool + """ + + jvm = SparkContext.getOrCreate()._jvm + schema = jvm.com.microsoft.ml.spark.schema.ImageSchema + return schema.isImage(df._jdf, column) diff --git a/src/readers/src/main/scala/AzureBlobReader.scala b/src/readers/src/main/scala/AzureBlobReader.scala new file mode 100644 index 0000000000..39a8830e7d --- /dev/null +++ b/src/readers/src/main/scala/AzureBlobReader.scala @@ -0,0 +1,72 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark +import com.microsoft.ml.spark.FileFormat.FileFormat +import java.util.NoSuchElementException +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json._ + +object AzureBlobReader { + def read (accountName: String, accountKey: String, containerName: String, filePath: String, + fileFormat: String, hasHeader: Boolean): DataFrame = { + val spark = SparkSession.builder.getOrCreate() + val fileFormatEnum = ReaderUtils.getFileFormat(fileFormat) + + // Register the credential + if (!ReaderUtils.isNullOrEmpty(accountKey)) { + val config = spark.sparkContext.hadoopConfiguration + val azureAccountKeyPrefix = "fs.azure.account.key." 
+ val azureAccountKeyPostfix = ".blob.core.windows.net" + config.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem") + config.set(azureAccountKeyPrefix.concat(accountName).concat(azureAccountKeyPostfix), accountKey) + } + + // Generate the url + var url: String = null + if (!ReaderUtils.isNullOrEmpty(containerName) && !ReaderUtils.isNullOrEmpty(accountName)) { + val urlPrefix = "wasbs://" + val urlPostfix = ".blob.core.windows.net/" + url = urlPrefix.concat(containerName).concat("@").concat(accountName).concat(urlPostfix).concat(filePath) + } else { + val urlPrefix = "wasbs:///" + url = urlPrefix.concat(filePath) + } + + // Populate the options + val options = ReaderUtils.getOptionsForBlobReader(fileFormatEnum, true, hasHeader) + + // Get the file format + var format = fileFormatEnum.toString + if (format == "tsv") { + format = "csv" + } + + spark.read.format(format).options(options).load(url) + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var accountName = "" + var accountKey = "" + var containerName = "" + var filePath = "" + var fileFormat = "" + var hasHeader = false; + try { + hasHeader = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("hasHeader").asInstanceOf[Boolean] + fileFormat = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("fileFormat").asInstanceOf[String] + filePath = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("filePath").asInstanceOf[String] + containerName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("containerName").asInstanceOf[String] + accountKey = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("accountKey").asInstanceOf[String] + accountName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("accountName").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(accountName, accountKey, containerName, filePath, fileFormat, hasHeader) + } +} diff --git a/src/readers/src/main/scala/AzureSQLReader.scala b/src/readers/src/main/scala/AzureSQLReader.scala new file mode 100644 index 0000000000..f9dc56c2e1 --- /dev/null +++ b/src/readers/src/main/scala/AzureSQLReader.scala @@ -0,0 +1,53 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
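A minimal sketch of calling the JSON entry point of AzureBlobReader above; the account, key, container, and path values are placeholders, and the keys mirror those extracted in read2:

    val df = AzureBlobReader.read2(
      """{"accountName": "myaccount", "accountKey": "<storage-key>",
          "containerName": "mycontainer", "filePath": "data/sample.csv",
          "fileFormat": "csv", "hasHeader": true}""")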
+ +package com.microsoft.ml.spark + +import java.util.NoSuchElementException +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json.JSON + +object AzureSQLReader { + def read(serverName: String, databaseName: String, query: String, userName: String, password: String): DataFrame = { + // val spark = SQLContext.getOrCreate(null) + val spark = SparkSession.builder.getOrCreate() + + // Convert query to subquery + val subQueryPrefix = "(" + val subQueryPostfix = ") AS mmlTempTable123" + val subQuery = subQueryPrefix.concat(query).concat(subQueryPostfix) + println(subQuery) + val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver" + val urlPrefix = "jdbc:sqlserver://" + val urlPostfix = ".database.windows.net" + val url = urlPrefix.concat(serverName).concat(urlPostfix) + val options = Map("url" -> url, "databaseName" -> databaseName, "driver" -> driver, "dbtable" -> subQuery, + "user" -> userName, "password" -> password) + + spark.read.format("jdbc").options(options).load() + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var serverName= "" + var databaseName = "" + var query = "" + var userName = "" + var password = "" + + try { + password = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("password").asInstanceOf[String] + userName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("userName").asInstanceOf[String] + query = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("query").asInstanceOf[String] + databaseName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("databaseName").asInstanceOf[String] + serverName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("serverName").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(serverName, databaseName, query, userName, password) + } +} diff --git a/src/readers/src/main/scala/BinaryFileReader.scala b/src/readers/src/main/scala/BinaryFileReader.scala new file mode 100644 index 0000000000..69451ac7df --- /dev/null +++ b/src/readers/src/main/scala/BinaryFileReader.scala @@ -0,0 +1,79 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
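A minimal sketch for AzureSQLReader above (server, database, table, and credentials are placeholders); the query is wrapped into a subquery internally, so a plain SELECT statement is expected:

    val df = AzureSQLReader.read(
      serverName   = "myserver",
      databaseName = "mydb",
      query        = "SELECT TOP 100 * FROM dbo.Flights",
      userName     = "<user>",
      password     = "<password>")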
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.BinaryFileSchema +import org.apache.spark.input.PortableDataStream +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +import scala.language.existentials +import com.microsoft.ml.spark.FileUtilities.{ZipIterator} +import com.microsoft.ml.spark.hadoop.{SamplePathFilter, RecursiveFlag} + +object BinaryFileReader { + + //single column of images named "image" + private val binaryDFSchema = StructType(StructField("value", BinaryFileSchema.columnSchema, true) :: Nil) + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive search flag + * @return Dataframe with a single column of "images", see imageSchema for details + */ + private[spark] def readRDD(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double, inspectZip: Boolean) + : RDD[(String, Array[Byte])] = { + + require(sampleRatio <= 1.0 && sampleRatio >= 0, "sampleRatio should be between 0 and 1") + + val oldRecursiveFlag = RecursiveFlag.setRecursiveFlag(Some(recursive.toString), spark) + val oldPathFilter: Option[Class[_]] = + if (sampleRatio < 1) + SamplePathFilter.setPathFilter(Some(classOf[SamplePathFilter]), Some(sampleRatio), Some(inspectZip), spark) + else + None + + var data: RDD[(String, Array[Byte])] = null + try { + val streams = spark.sparkContext.binaryFiles(path, spark.sparkContext.defaultParallelism) + + // Create files RDD and load bytes + data = if(!inspectZip) { + streams.mapValues((stream: PortableDataStream) => stream.toArray) + } + else{ + // if inspectZip is enabled, examine/sample the contents of zip files + streams.flatMap({ case (filename: String, stream: PortableDataStream) => + if (SamplePathFilter.isZipFile(filename)) { + new ZipIterator(stream, filename, sampleRatio) + } + else { + Some((filename, stream.toArray)) + } + }) + } + } + finally { + // return Hadoop flag to its original value + RecursiveFlag.setRecursiveFlag(oldRecursiveFlag, spark = spark) + SamplePathFilter.setPathFilter(oldPathFilter, spark = spark) + () + } + + data + } + + def read(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = { + val rowRDD = readRDD(path, recursive, spark, sampleRatio, inspectZip) + .map({row:(String, Array[Byte]) => Row(Row(row._1, row._2))}) + + spark.createDataFrame(rowRDD, binaryDFSchema) + } +} + diff --git a/src/readers/src/main/scala/FileFormat.scala b/src/readers/src/main/scala/FileFormat.scala new file mode 100644 index 0000000000..d7ead617e4 --- /dev/null +++ b/src/readers/src/main/scala/FileFormat.scala @@ -0,0 +1,12 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +object FileFormat extends Enumeration { + type FileFormat = Value + val Csv = Value("csv") + val Tsv = Value("tsv") + val Json = Value("json") + val Parquet = Value("parquet") +} diff --git a/src/readers/src/main/scala/ImageReader.scala b/src/readers/src/main/scala/ImageReader.scala new file mode 100644 index 0000000000..3594dbcb7d --- /dev/null +++ b/src/readers/src/main/scala/ImageReader.scala @@ -0,0 +1,63 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
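A minimal usage sketch of the Scala BinaryFileReader above, assuming an active SparkSession named spark and an illustrative path; the result has a single "value" column whose "path" and "bytes" sub-fields follow BinaryFileSchema:

    val files = BinaryFileReader.read("wasb:///data/images", recursive = true,
                                      spark = spark, sampleRatio = 0.5, inspectZip = true)
    files.select("value.path").show()      // sub-fields are accessible directly in SQL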
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.ImageSchema +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.opencv.core.{Core, MatOfByte} +import org.opencv.imgcodecs.Imgcodecs + +object ImageReader { + + //single column of images named "image" + private val imageDFSchema = StructType(StructField("image", ImageSchema.columnSchema, true) :: Nil) + + /** + * Convert the image from compressd (jpeg, etc.) into OpenCV representation and store it in Row + * See ImageSchema for details. + * + * @param filename arbitrary string + * @param bytes image bytes (for example, jpeg) + * @return returns None if decompression fails + */ + private[spark] def decode(filename: String, bytes: Array[Byte]): Option[Row] = { + val mat = new MatOfByte(bytes: _*) + val decoded = Imgcodecs.imdecode(mat, Imgcodecs.CV_LOAD_IMAGE_COLOR) + + if (decoded.empty()) { + None + } else { + val ocvBytes = new Array[Byte](decoded.total.toInt * decoded.elemSize.toInt) + + // extract OpenCV bytes + decoded.get(0, 0, ocvBytes) + + // type: CvType.CV_8U + Some(Row(filename, decoded.height, decoded.width, decoded.`type`, ocvBytes)) + } + } + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive search flag + * @return Dataframe with a single column of "images", see imageSchema for details + */ + def read(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = { + + val binaryRDD = BinaryFileReader.readRDD(path, recursive, spark, sampleRatio, inspectZip) + val binaryRDDlib = ImageSchema.loadLibraryForAllPartitions(binaryRDD, Core.NATIVE_LIBRARY_NAME) + + val validImages = binaryRDDlib.flatMap { + case (filename, bytes) => { + decode(filename, bytes).map(x => Row(x)) + } + } + + spark.createDataFrame(validImages, imageDFSchema) + } +} diff --git a/src/readers/src/main/scala/ReaderUtils.scala b/src/readers/src/main/scala/ReaderUtils.scala new file mode 100644 index 0000000000..fa36d768f7 --- /dev/null +++ b/src/readers/src/main/scala/ReaderUtils.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
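A minimal usage sketch of ImageReader above, assuming an active SparkSession named spark and an illustrative path; decoded images land in a single "image" column whose sub-fields follow ImageSchema:

    val images = ImageReader.read("wasb:///data/images", recursive = true, spark = spark)
    images.select("image.path", "image.height", "image.width").show()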
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.FileFormat.FileFormat + +object ReaderUtils { + def isNullOrEmpty(str: String): Boolean = { + return str == null || str.trim.isEmpty + } + + def getFileFormat(str: String): FileFormat = { + if (isNullOrEmpty(str)) { + throw new IllegalArgumentException("str is invalid.") + } + + if (str.equalsIgnoreCase("csv")) { + return FileFormat.Csv + } else if (str.equalsIgnoreCase("tsv")) { + return FileFormat.Tsv + }else if (str.equalsIgnoreCase("json")) { + return FileFormat.Json + } else if (str.equalsIgnoreCase("parquet")) { + return FileFormat.Parquet + } else { + throw new IllegalArgumentException("str is not valid file format.") + } + } + + def getOptionsForBlobReader(fileFormat: FileFormat, inferSchema: Boolean, hasHeader: Boolean): Map[String, String] = { + var headerOpt = "false" + if (hasHeader) { + headerOpt = "true" + } + var schemaOpt = "false" + if (inferSchema) { + schemaOpt = "true" + } + var options = Map("inferSchema" -> schemaOpt, "header" -> headerOpt) + if (fileFormat == FileFormat.Tsv) { + options = options + ("delimiter" -> "\t") + } + + return options + } +} diff --git a/src/readers/src/main/scala/Readers.scala b/src/readers/src/main/scala/Readers.scala new file mode 100644 index 0000000000..a4ee2359ea --- /dev/null +++ b/src/readers/src/main/scala/Readers.scala @@ -0,0 +1,50 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, SparkSession} +import scala.language.implicitConversions + +/** + * Implicit conversion allows sparkSession.readImages(...) syntax + * Example: + * import com.microsoft.ml.spark.Readers.implicits._ + * sparkSession.readImages(path, recursive = false) + */ +object Readers { + + object implicits { + + class Session(sparkSession: SparkSession) { + + /** + * + * @param path Path to the files directory + * @param recursive Recursive path search flag + * @param sampleRatio Fraction of the files loaded + * @param inspectZip Whether zip files are treated as directories + * @return Dataframe with a single column "value" of binary files, see BinaryFileSchema for details + */ + def readBinaryFiles(path: String, recursive: Boolean, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = + BinaryFileReader.read(path, recursive, sparkSession, sampleRatio, inspectZip) + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive path search flag + * @param sampleRatio Fraction of the files loaded + * @param inspectZip Whether zip files are treated as directories + * @return Dataframe with a single column "image" of images, see ImageSchema for details + */ + def readImages(path: String, recursive: Boolean, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = + ImageReader.read(path, recursive, sparkSession, sampleRatio, inspectZip) + } + + implicit def ImplicitSession(sparkSession: SparkSession):Session = new Session(sparkSession) + + } +} diff --git a/src/readers/src/main/scala/WasbReader.scala b/src/readers/src/main/scala/WasbReader.scala new file mode 100644 index 0000000000..80d38b22bc --- /dev/null +++ b/src/readers/src/main/scala/WasbReader.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
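The implicit conversion above attaches both readers directly to SparkSession. A minimal sketch, assuming an active session named spark and illustrative paths:

    import com.microsoft.ml.spark.Readers.implicits._

    val images = spark.readImages("wasb:///data/images", recursive = true, sampleRatio = 0.5)
    val files  = spark.readBinaryFiles("wasb:///data", recursive = false)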
+ +package com.microsoft.ml.spark + +import java.util.NoSuchElementException +import com.microsoft.ml.spark.FileFormat.FileFormat +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json.JSON + +object WasbReader { + def read (url: String, fileFormat: String, hasHeader: Boolean): DataFrame = { + val spark = SparkSession.builder.getOrCreate() + val fileFormatEnum = ReaderUtils.getFileFormat(fileFormat) + + // Populate the options + val options = ReaderUtils.getOptionsForBlobReader(fileFormatEnum, true, hasHeader) + + // Get the file format + var format = fileFormatEnum.toString + if (format == "tsv") { + format = "csv" + } + + spark.read.format(format).options(options).load(url) + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var url = "" + var fileFormat = "" + var hasHeader = false + + try { + hasHeader = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("hasHeader").asInstanceOf[Boolean] + fileFormat = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("fileFormat").asInstanceOf[String] + url = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("url").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(url, fileFormat, hasHeader) + } +} diff --git a/src/readers/src/test/scala/BinaryFileReaderSuite.scala b/src/readers/src/test/scala/BinaryFileReaderSuite.scala new file mode 100644 index 0000000000..b1d00f30e7 --- /dev/null +++ b/src/readers/src/test/scala/BinaryFileReaderSuite.scala @@ -0,0 +1,44 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import com.microsoft.ml.spark.schema.BinaryFileSchema.isBinaryFile +import com.microsoft.ml.spark.Readers.implicits._ +import FileReaderSuiteUtils._ + +class BinaryFileReaderSuite extends TestBase { + + test("binary dataframe") { + + val data = session.readBinaryFiles(groceriesDirectory, recursive = true) + + println(time { data.count }) + + assert(isBinaryFile(data, "value")) + + val paths = data.select("value.path") //make sure that SQL has access to the sub-fields + assert(paths.count == 31) //note that text file is also included + } + + test("sample ratio test") { + + val all = session.readBinaryFiles(groceriesDirectory, recursive = true, sampleRatio = 1.0) + val sampled = session.readBinaryFiles(groceriesDirectory, recursive = true, sampleRatio = 0.5) + val count = sampled.count + assert(count > 0 && count < all.count, "incorrect sampling behavior") + } + + test("with zip file") { + /* remove when datasets/Images is updated */ + creatZips + + val images = session.readBinaryFiles(imagesDirectory, recursive = true) + assert(images.count == 74) + + val images1 = session.readBinaryFiles(imagesDirectory, recursive = true, inspectZip = false) + assert(images1.count == 39) + } + +} diff --git a/src/readers/src/test/scala/ImageReaderSuite.scala b/src/readers/src/test/scala/ImageReaderSuite.scala new file mode 100644 index 0000000000..312b376b20 --- /dev/null +++ b/src/readers/src/test/scala/ImageReaderSuite.scala @@ -0,0 +1,75 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
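+// Illustrative sketch of the implicit reader syntax exercised by these test suites;
+// the SparkSession `spark` and the directory `dir` are hypothetical:
+//
+//   import com.microsoft.ml.spark.Readers.implicits._
+//   val binaries = spark.readBinaryFiles(dir, recursive = true)
+//   val images = spark.readImages(dir, recursive = true, inspectZip = false)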
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import com.microsoft.ml.spark.schema.ImageSchema.isImage +import com.microsoft.ml.spark.schema.BinaryFileSchema.isBinaryFile +import org.apache.spark.input.PortableDataStream +import com.microsoft.ml.spark.Readers.implicits._ +import com.microsoft.ml.spark.FileUtilities._ + +object FileReaderSuiteUtils { + val fileLocation = s"${sys.env("DATASETS_HOME")}" + val imagesDirectory = fileLocation + "/Images" + val groceriesDirectory = imagesDirectory + "/Grocery" + val cifarDirectory = imagesDirectory + "/CIFAR" + + def createZip(directory: String): Unit ={ + val dir = new File(directory) + val zipfile = new File(directory + ".zip") + if(!zipfile.exists()) + zipFolder(dir, zipfile) + } + + def creatZips(): Unit ={ + createZip(groceriesDirectory) + createZip(cifarDirectory) + } +} + +import FileReaderSuiteUtils._ + +class ImageReaderSuite extends TestBase { + + test("image dataframe") { + + val images = session.readImages(groceriesDirectory, recursive = true) + + println(time { images.count }) + + assert(isImage(images, "image")) // make sure the column "images" exists and has the right type + + val paths = images.select("image.path") //make sure that SQL has access to the sub-fields + assert(paths.count == 30) + + val areas = images.select(images("image.width") * images("image.height")) //more complicated SQL statement + + println(s" area of image 1 ${areas.take(1)(0)}") + } + + test("with zip file") { + /* remove when datasets/Images is updated */ + creatZips + + val images = session.readImages(imagesDirectory, recursive = true) + assert(isImage(images, "image")) + assert(images.count == 72) + + val images1 = session.readImages(imagesDirectory, recursive = true, inspectZip = false) + assert(images1.count == 36) + } + + test("sample ratio test") { + + sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true") + + val f = sc.binaryFiles(groceriesDirectory) + println(time { f.count }) + + val images = session.readImages(groceriesDirectory, recursive = true, sampleRatio = 0.5) + println(time { images.count }) //the count changes depending on random number generator + } + +} diff --git a/src/summarize-data/build.sbt b/src/summarize-data/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/summarize-data/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/summarize-data/src/main/scala/SummarizeData.scala b/src/summarize-data/src/main/scala/SummarizeData.scala new file mode 100644 index 0000000000..d104c7d2d6 --- /dev/null +++ b/src/summarize-data/src/main/scala/SummarizeData.scala @@ -0,0 +1,189 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
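+// Illustrative usage sketch of the transformer defined below; the DataFrame `df` is hypothetical:
+//
+//   val summary = new SummarizeData().setBasic(true).setPercentiles(false)
+//   val stats = summary.transform(df)   // one output row of statistics per input column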
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.storage.StorageLevel + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +trait SummarizeDataParams extends MMLParams { + + final val counts: BooleanParam = BooleanParam(this, "counts", "compute count statistics", true) + final def getCounts: Boolean = $(counts) + def setCounts(value: Boolean): this.type = set(counts, value) + + final val basic: BooleanParam = new BooleanParam(this, "basic", "compute basic statistics") + setDefault(basic, true) + final def getBasic: Boolean = $(basic) + def setBasic(value: Boolean): this.type = set(basic, value) + + final val sample: BooleanParam = new BooleanParam(this, "sample", "compute sample statistics") + setDefault(sample, true) + final def getSample: Boolean = $(sample) + def setSample(value: Boolean): this.type = set(sample, value) + + final val percentiles: BooleanParam = new BooleanParam(this, "percentiles", "compute percentiles") + setDefault(percentiles, true) + final def getPercentiles: Boolean = $(percentiles) + def setPercentiles(value: Boolean): this.type = set(percentiles, value) + + final val errorThreshold: DoubleParam = + new DoubleParam(this, "errorThreshold", "threshold for quantiles - 0 is exact") + setDefault(errorThreshold, 0.0) + final def getErrorThreshold: Double = $(errorThreshold) + def setErrorThreshold(value: Double): this.type = set(errorThreshold, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + val columns = ListBuffer(SummarizeData.featureColumn) + if ($(counts)) columns ++= SummarizeData.countFields + if ($(basic)) columns ++= SummarizeData.basicFields + if ($(sample)) columns ++= SummarizeData.sampleFields + if ($(percentiles)) columns ++= SummarizeData.percentilesFields + StructType(columns) + } +} + +// UID should be overridden by driver for controlled identification at the DAG level +class SummarizeData(override val uid: String) + extends Transformer + with SummarizeDataParams { + + import SummarizeData.Statistic._ + + def this() = this(Identifiable.randomUID("SummarizeData")) + + def setStatistics(stats: List[Statistic]): Unit = ??? + + override def transform(dataset: Dataset[_]): DataFrame = { + + val df = dataset.toDF() + // Some of these statistics are bad to compute + df.persist(StorageLevel.MEMORY_ONLY) + + val subFrames = ListBuffer[DataFrame]() + if ($(counts)) subFrames += computeCounts(df) + if ($(basic)) subFrames += curriedBasic(df) + if ($(sample)) subFrames += sampleStats(df) + if ($(percentiles)) subFrames += curriedPerc(df) + + df.unpersist(false) + + val base = createJoinBase(df) + subFrames.foldLeft(base) { (z, dfi) => z.join(dfi, SummarizeData.featureColumnName) } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): SummarizeData = defaultCopy(extra) + + private def computeCounts = computeOnAll(computeCountsImpl, SummarizeData.countFields) + + private def computeCountsImpl(col: String, df: DataFrame): Array[Double] = { + val column = df.col(col) + val mExpr = isnan(column) || isnull(column) + val countMissings = df.where(mExpr).count().toDouble + // approxCount returns Long which > Double! 
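+    // approx_count_distinct yields a Long; it is read with getLong and converted to Double
+    // so that it fits the Array[Double] of results below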
+ val dExpr = approx_count_distinct(column) + val distinctCount = df.select(dExpr).first.getLong(0).toDouble + Array(df.count() - countMissings, distinctCount, countMissings) + } + + private def sampleStats = computeOnNumeric(sampleStatsImpl, SummarizeData.sampleFields) + + private def sampleStatsImpl(col: String, df: DataFrame): Array[Double] = { + val column = df.col(col) + val k = kurtosis(column) + val sk = skewness(column) + val v = variance(column) + val sd = stddev(column) + df.select(v, sd, sk, k).first.toSeq.map(_.asInstanceOf[Double]).toArray + } + + private def curriedBasic = { + val quants = SummarizeData.basicQuantiles + computeOnNumeric(quantStub(quants, $(errorThreshold)), SummarizeData.basicFields) + } + + private def curriedPerc = { + val quants = SummarizeData.percentilesQuantiles + computeOnNumeric(quantStub(quants, $(errorThreshold)), SummarizeData.percentilesFields) + } + + private def quantStub(vals: Array[Double], err: Double) = + (cn: String, df: DataFrame) => df.stat.approxQuantile(cn, vals, err) + + private def computeOnNumeric = computeColumnStats(sf => sf.dataType.isInstanceOf[NumericType]) _ + + private def computeOnAll = computeColumnStats(sf => true) _ + + private def allNaNs(l: Int): Array[Double] = Array.fill(l)(Double.NaN) + + private def createJoinBase(df: DataFrame) = computeColumnStats(sf => false)((cn, df) => Array(), List())(df) + + private def computeColumnStats + (p: StructField => Boolean) + (statFunc: (String, DataFrame) => Array[Double], newColumns: Seq[StructField]) + (df: DataFrame): DataFrame = { + val emptyRow = allNaNs(newColumns.length) + val outList = df.schema.map(col => (col.name, if (p(col)) statFunc(col.name, df) else emptyRow)) + val rows = outList.map { case (n, r) => Row.fromSeq(n +: r) } + val schema = SummarizeData.featureColumn +: newColumns + df.sparkSession.createDataFrame(rows.asJava, StructType(schema)) + } + +} + +object SummarizeData extends DefaultParamsReadable[SummarizeData] { + + object Statistic extends Enumeration { + type Statistic = Value + val Counts, Basic, Sample, Percentiles = Value + } + + final val featureColumnName = "Feature" + final val featureColumn = StructField(featureColumnName, StringType, false) + + final val percentilesQuantiles = Array(0.005, 0.01, 0.05, 0.95, 0.99, 0.995) + final val percentilesFields = List( + StructField("P0.5", DoubleType, true), + StructField("P1", DoubleType, true), + StructField("P5", DoubleType, true), + StructField("P95", DoubleType, true), + StructField("P99", DoubleType, true), + StructField("P99.5", DoubleType, true)) + + final val sampleFields = List( + StructField("Sample Variance", DoubleType, true), + StructField("Sample Standard Deviation", DoubleType, true), + StructField("Sample Skewness", DoubleType, true), + StructField("Sample Kurtosis", DoubleType, true)) + + final val basicQuantiles = Array(0, 0.25, 0.5, 0.75, 1) + final val basicFields = List( + StructField("Min", DoubleType, true), + StructField("1st Quartile", DoubleType, true), + StructField("Median", DoubleType, true), + StructField("3rd Quartile", DoubleType, true), + StructField("Max", DoubleType, true) + //TODO: StructField("Range", DoubleType, true), + //TODO: StructField("Mean", DoubleType, true), + //TODO: StructField("Mean Deviation", DoubleType, true), + // Mode is JSON Array of modes - needs a little special treatment + //TODO: StructField("Mode", StringType, true)) + ) + + final val countFields = List( + StructField("Count", DoubleType, false), + StructField("Unique Value Count", DoubleType, 
false), + StructField("Missing Value Count", DoubleType, false)) +} diff --git a/src/summarize-data/src/test/scala/SummarizeDataSuite.scala b/src/summarize-data/src/test/scala/SummarizeDataSuite.scala new file mode 100644 index 0000000000..db7b03fcaa --- /dev/null +++ b/src/summarize-data/src/test/scala/SummarizeDataSuite.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ + +class SummarizeDataSuite extends TestBase { + + test("Smoke test for summarizing basic DF - schema transform") { + + val input = makeBasicDF() + val summary = new SummarizeData() + val result = summary.transformSchema(input.schema) + assert(result.length > 10) + } + + test("Smoke test for summary params") { + val s = new SummarizeData() + assert(s.params.length == 5) + assert(s.params.map(s.isSet).toSeq == (1 to s.params.length).map(i => false)) + + val sNoCounts = s.setCounts(false).setPercentiles(false) + assert(sNoCounts.params.map(sNoCounts.isSet).toSeq === Seq(false, true, false, true, false)) + } + + test("Smoke test for summarizing basic DF") { + val input = makeBasicDF() + val summary = new SummarizeData() + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length > 18) + } + + test("Smoke test for summarizing missings DF") { + val input = makeBasicNullableDF() + val summary = new SummarizeData() + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length > 18) + } + + test("Smoke test for subset summarizing missings DF") { + val input = makeBasicNullableDF() + val summary = new SummarizeData().setPercentiles(false).setCounts(false) + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length < 11) + } + +} diff --git a/src/text-featurizer/build.sbt b/src/text-featurizer/build.sbt new file mode 100644 index 0000000000..47a5d9cbee --- /dev/null +++ b/src/text-featurizer/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: utils diff --git a/src/text-featurizer/src/main/scala/TextFeaturizer.scala b/src/text-featurizer/src/main/scala/TextFeaturizer.scala new file mode 100644 index 0000000000..d97fd3b946 --- /dev/null +++ b/src/text-featurizer/src/main/scala/TextFeaturizer.scala @@ -0,0 +1,442 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.NoSuchElementException + +import org.apache.hadoop.fs.Path +import org.apache.spark.ml.feature._ +import org.apache.spark.ml._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode} +import org.apache.spark.sql.types._ +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.attribute.AttributeGroup + +trait TextFeaturizerParams extends MMLParams{ + + //Tokenizer Params + /** @group param */ + val useTokenizer = BooleanParam(this, "useTokenizer", "Whether to tokenize the input") + + /** @group getParam **/ + final def getUseTokenizer: Boolean = $(useTokenizer) + + /** @group param */ + val tokenizerGaps = BooleanParam( + this, + "tokenizerGaps", + "Indicates whether regex splits on gaps (true) or matches tokens (false)." 
+ ) + + /** @group getParam **/ + final def getTokenizerGaps: Boolean = $(tokenizerGaps) + + /** @group param */ + val minTokenLength = IntParam(this, "minTokenLength", "Minimum token length, >= 0.") + + /** @group getParam **/ + final def getMinTokenLength: Int = $(minTokenLength) + + /** @group param */ + val tokenizerPattern = StringParam( + this, + "tokenizerPattern", + "Regex pattern used to match delimiters if gaps is true or tokens if gaps is false.") + + /** @group getParam **/ + final def getTokenizerPattern: String = $(tokenizerPattern) + + /** @group param */ + val toLowercase = BooleanParam( + this, + "toLowercase", + "Indicates whether to convert all characters to lowercase before tokenizing.") + + /** @group getParam **/ + final def getToLowercase: Boolean = $(toLowercase) + + //Stop Word Remover Params + /** @group param */ + val useStopWordsRemover = BooleanParam(this, + "useStopWordsRemover", + "Whether to remove stop words from tokenized data") + + /** @group getParam **/ + final def getUseStopWordsRemover: Boolean = $(useStopWordsRemover) + + val caseSensitiveStopWords = BooleanParam( + this, + "caseSensitiveStopWords", + "Whether to do a case-sensitive comparison over the stop words") + + final def getCaseSensitiveStopWords: Boolean = $(caseSensitiveStopWords) + + val defaultStopWordLanguage = StringParam(this, + "defaultStopWordLanguage", + "Which language to use for the stop word remover," + + " set this to 'custom' to use the stopWords input") + + final def getDefaultStopWordLanguage: String = $(defaultStopWordLanguage) + + val stopWords = StringParam(this, "stopWords", "The words to be filtered out.") + + final def getStopWords: String = $(stopWords) + + //Ngram Params + /** @group param */ + val useNGram = BooleanParam(this, "useNGram", "Whether to enumerate N-grams") + + /** @group getParam **/ + final def getUseNGram: Boolean = $(useNGram) + + /** @group param */ + val nGramLength = IntParam(this, "nGramLength", "The size of the N-grams") + + /** @group getParam **/ + final def getNGramLength: Int = $(nGramLength) + + //HashingTF Params + /** @group param */ + val binary = BooleanParam( + this, + "binary", + "If true, all nonzero word counts are set to 1") + + /** @group getParam **/ + final def getBinary: Boolean = $(binary) + + /** @group param */ + val numFeatures = IntParam( + this, + "numFeatures", + "Set the number of features to hash each document to") + + /** @group getParam **/ + final def getNumFeatures: Int = $(numFeatures) + + //IDF Params + /** @group param */ + val useIDF = BooleanParam( + this, + "useIDF", + "Whether to scale the Term Frequencies by IDF") + + /** @group getParam **/ + final def getUseIDF: Boolean = $(useIDF) + + /** @group param */ + val minDocFreq = IntParam( + this, + "minDocFreq", + "The minimum number of documents in which a term should appear.") + + /** @group getParam **/ + final def getMinDocFreq: Int = $(minDocFreq) + +} + +object TextFeaturizer extends DefaultParamsReadable[TextFeaturizer] + +class TextFeaturizer(override val uid: String) + extends Estimator[TextFeaturizerModel] + with TextFeaturizerParams with HasInputCol with HasOutputCol { + def this() = this(Identifiable.randomUID("TextFeaturizer")) + + setDefault(outputCol, uid + "__output") + + def setUseTokenizer(value: Boolean): this.type = set(useTokenizer, value) + + setDefault(useTokenizer -> true) + + /** @group setParam **/ + def setTokenizerGaps(value: Boolean): this.type = set(tokenizerGaps, value) + + setDefault(tokenizerGaps -> true) + + /** @group
setParam **/ + def setMinTokenLength(value: Int): this.type = set(minTokenLength, value) + + setDefault(minTokenLength -> 0) + + /** @group setParam **/ + def setTokenizerPattern(value: String): this.type = + set(tokenizerPattern, value) + + setDefault(tokenizerPattern -> "\\s+") + + /** @group setParam **/ + def setToLowercase(value: Boolean): this.type = set(toLowercase, value) + + setDefault(toLowercase -> true) + + /** @group setParam **/ + def setUseStopWordsRemover(value: Boolean): this.type = + set(useStopWordsRemover, value) + + setDefault(useStopWordsRemover -> false) + + /** @group setParam **/ + def setCaseSensitiveStopWords(value: Boolean): this.type = + set(caseSensitiveStopWords, value) + + setDefault(caseSensitiveStopWords -> false) + + /** @group setParam **/ + def setDefaultStopWordLanguage(value: String): this.type = + set(defaultStopWordLanguage, value) + + setDefault(defaultStopWordLanguage -> "english") + + /** @group setParam **/ + def setStopWords(value: String): this.type = set(stopWords, value) + + /** @group setParam **/ + def setUseNGram(value: Boolean): this.type = set(useNGram, value) + + /** @group setParam **/ + def setNGramLength(value: Int): this.type = set(nGramLength, value) + + setDefault(useNGram -> false, nGramLength -> 2) + + /** @group setParam **/ + def setBinary(value: Boolean): this.type = set(binary, value) + + /** @group setParam **/ + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + setDefault(numFeatures -> (1 << 18), binary -> false) + + /** @group setParam **/ + def setUseIDF(value: Boolean): this.type = set(useIDF, value) + + /** @group setParam **/ + def setMinDocFreq(value: Int): this.type = set(minDocFreq, value) + + setDefault(useIDF -> true, minDocFreq -> 1) + + private def setParamInternal[M <: PipelineStage, T](model: M, + name: String, + value: T) = { + model.set(model.getParam(name), value) + } + + private def getParamInternal[M <: PipelineStage, T](model: M, name: String) = { + model.getOrDefault(model.getParam(name)) + } + + override def fit(dataset: Dataset[_]): TextFeaturizerModel = { + try { + getUseTokenizer + } catch { + case e: NoSuchElementException => setUseTokenizer(needsTokenizer(dataset.schema)) + } + + transformSchema(dataset.schema) + var models: List[PipelineStage] = Nil + if (getUseTokenizer) + models ::= new RegexTokenizer() + .setGaps(getTokenizerGaps) + .setPattern(getTokenizerPattern) + .setMinTokenLength(getMinTokenLength) + .setToLowercase(getToLowercase) + if (getUseStopWordsRemover) { + val swr = + new StopWordsRemover().setCaseSensitive(getCaseSensitiveStopWords) + if (getDefaultStopWordLanguage == "custom") { + models ::= swr.setStopWords(getStopWords.split(",")) + } else { + models ::= swr.setStopWords( + StopWordsRemover.loadDefaultStopWords(getDefaultStopWordLanguage)) + } + } + if (getUseNGram) + models ::= new NGram().setN(getNGramLength) + models ::= new HashingTF() + .setBinary(getBinary) + .setNumFeatures(getNumFeatures) + if (getUseIDF) + models ::= new IDF().setMinDocFreq(getMinDocFreq) + models = models.reverse + + val chainedModels = models + .zip(0 to models.length) + .map( + { pair: (PipelineStage, Int) => + val model = pair._1 + val i = pair._2 + if (i == 0) { + setParamInternal(model, "inputCol", getInputCol) + } else if (i < models.length - 1) { + setParamInternal(model, + "inputCol", + getParamInternal(models(i - 1), "outputCol")) + } else { + val m1 = + setParamInternal(model, + "inputCol", + getParamInternal(models(i - 1), "outputCol")) + setParamInternal(m1, 
"outputCol", getOutputCol) + } + } + ) + val colsToDrop = chainedModels.reverse.tail + .map(getParamInternal(_, "outputCol").asInstanceOf[String]) + val fitPipeline = + new Pipeline().setStages(chainedModels.toArray).fit(dataset) + new TextFeaturizerModel(uid, fitPipeline, colsToDrop).setParent(this) + } + + override def copy(extra: ParamMap): Estimator[TextFeaturizerModel] = + defaultCopy(extra) + + def transformSchema(schema: StructType): StructType = { + val inputType = schema($(inputCol)).dataType + validateInputType(inputType) + if (schema.fieldNames.contains($(outputCol))) { + throw new IllegalArgumentException( + s"Output column ${$(outputCol)} already exists.") + } + val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) + appendColumn(schema, attrGroup.toStructField()) + } + + private def needsTokenizer(schema: StructType) = { + val inputType = schema($(inputCol)).dataType + inputType == StringType + } + + private def validateInputType(inputType: DataType) = { + if (getUseTokenizer) { + if (inputType == ArrayType(StringType)) { + require( + inputType == StringType, + s"Input type must be string type but got $inputType. " + + s"It looks like your data is already tokenized, Try with useTokenizer=False") + } + require(inputType == StringType, + s"Input type must be string type but got $inputType.") + } else if (getUseNGram) { + if (inputType == StringType) { + require( + inputType == ArrayType(StringType), + s"Input type must be Array[string] type but got $inputType. " + + s"It looks like your data not tokenized, Try with useTokenizer=True") + } + require( + inputType == ArrayType(StringType), + s"Input type must be Array(String) type type but got $inputType.") + } else { + if (inputType == StringType) { + require( + inputType.isInstanceOf[ArrayType], + s"Input type must be Array[_] type but got $inputType. 
" + + s"It looks like your data not tokenized, Try with useTokenizer=True") + } + require(inputType.isInstanceOf[ArrayType], + s"Input type must be Array(_) type type but got $inputType.") + } + } + + private def appendColumn(schema: StructType, col: StructField): StructType = { + require(!schema.fieldNames.contains(col.name), + s"Column ${col.name} already exists.") + StructType(schema.fields :+ col) + } +} + +class TextFeaturizerModel(val uid: String, + fitPipeline: PipelineModel, + colsToDrop: List[String]) + extends Model[TextFeaturizerModel] with MLWritable { + + override def write: MLWriter = new TextFeaturizerModel.TextFeaturizerModelWriter(uid, fitPipeline, colsToDrop) + + override def copy(extra: ParamMap): TextFeaturizerModel = defaultCopy(extra) + + override def transform(dataset: Dataset[_]): DataFrame = { + colsToDrop.foldRight(fitPipeline.transform(dataset))((col, df) => + df.drop(col)) + } + + override def transformSchema(schema: StructType): StructType = + colsToDrop.foldRight(fitPipeline.transformSchema(schema))((col, schema) => + StructType(schema.drop(schema.fieldIndex(col)))) +} + +object TextFeaturizerModel extends MLReadable[TextFeaturizerModel] { + + private val fitPipelinePart = "fitPipeline" + private val colsToDropPart = "colsToDrop" + private val dataPart = "data" + + override def read: MLReader[TextFeaturizerModel] = new TextFeaturizerModelReader + + override def load(path: String): TextFeaturizerModel = super.load(path) + + /** [[MLWriter]] instance for [[TextFeaturizerModel]] */ + private[TextFeaturizerModel] + class TextFeaturizerModelWriter(val uid: String, + val fitPipeline: PipelineModel, + val colsToDrop: List[String]) + extends MLWriter { + + private case class Data(uid: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TextFeaturizerModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + val dataPath = new Path(qualPath, dataPart).toString + + // Save data + val data = Data(uid) + + // save the columns to drop + ObjectUtilities.writeObject(colsToDrop, qualPath, colsToDropPart, sc, overwrite) + + // save the pipeline + val fitPipelinePath = new Path(qualPath, fitPipelinePart).toString + val fitPipelineWriter = + if (overwrite) fitPipeline.write.overwrite() + else fitPipeline.write + fitPipelineWriter.save(fitPipelinePath) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TextFeaturizerModelReader + extends MLReader[TextFeaturizerModel] { + override def load(path: String): TextFeaturizerModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid and one hot encoding param + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String) = data.select("uid").head() + + // load the fit pipeline + val fitPipelinePath = new Path(qualPath, fitPipelinePart).toString + val fitPipeline = PipelineModel.load(fitPipelinePath) + + // load the columns to drop + val colsToDrop = ObjectUtilities.loadObject[List[String]](qualPath, colsToDropPart, sc) + + new TextFeaturizerModel(uid, fitPipeline, colsToDrop) + } + } + +} + diff --git 
a/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala b/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala new file mode 100644 index 0000000000..c1018de873 --- /dev/null +++ b/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala @@ -0,0 +1,86 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.{NGram, Tokenizer} +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.Estimator +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +class TextFeaturizerSpec extends EstimatorFuzzingTest { + val dfRaw = session + .createDataFrame(Seq((0, "Hi I"), + (1, "I wish for snow today"), + (2, "we Cant go to the park, because of the snow!"), + (3, ""))) + .toDF("label", "sentence") + val dfTok = new Tokenizer() + .setInputCol("sentence") + .setOutputCol("tokens") + .transform(dfRaw) + val dfNgram = + new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(dfTok) + + test("operate on sentences,tokens,or ngrams") { + val tfRaw = new TextFeaturizer() + .setInputCol("sentence") + .setOutputCol("features") + .setNumFeatures(20) + val tfTok = new TextFeaturizer() + .setUseTokenizer(false) + .setInputCol("tokens") + .setOutputCol("features") + .setNumFeatures(20) + val tfNgram = new TextFeaturizer() + .setUseTokenizer(false) + .setUseNGram(false) + .setInputCol("ngrams") + .setOutputCol("features") + .setNumFeatures(20) + + val dfRaw2 = tfRaw.fit(dfRaw).transform(dfRaw) + val dfTok2 = tfTok.fit(dfTok).transform(dfTok) + val dfNgram2 = tfNgram.fit(dfNgram).transform(dfNgram) + + val linesRaw = dfRaw2.getSVCol("features") + val linesTok = dfTok2.getSVCol("features") + val linesNgram = dfNgram2.getSVCol("features") + + assert(linesRaw.length == 4) + assert(linesTok.length == 4) + assert(linesNgram.length == 4) + assert(linesRaw(0)(0) == 0.9162907318741551) + assert(linesTok(1)(9) == 0.5108256237659907) + assert(linesNgram(2)(7) == 1.8325814637483102) + assert(linesNgram(3)(1) == 0.0) + } + + test("throw errors if the schema is incorrect") { + val tfRaw = new TextFeaturizer() + .setUseTokenizer(true) + .setInputCol("sentence") + .setOutputCol("features") + .setNumFeatures(20) + val tfTok = new TextFeaturizer() + .setUseTokenizer(false) + .setInputCol("tokens") + .setOutputCol("features") + .setNumFeatures(20) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens"), dfTok) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("ngrams"), dfNgram) + assertSparkException[IllegalArgumentException](tfTok.setInputCol("sentence"), dfRaw) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens_incorrect"), dfTok) + assertSparkException[IllegalArgumentException](tfRaw.setOutputCol("tokens"), dfTok) + } + + val inputCol = "text" + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TextFeaturizer].setInputCol(inputCol) + + override def getEstimator(): Estimator[_] = new TextFeaturizer() + + override def schemaForDataset: StructType = new StructType(Array(StructField(inputCol, StringType, false))) +} diff --git a/src/train-classifier/build.sbt b/src/train-classifier/build.sbt new file mode 100644 index 0000000000..f0037af8bd --- /dev/null +++ b/src/train-classifier/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: 
utils +//> DependsOn: featurize diff --git a/src/train-classifier/src/main/scala/TrainClassifier.scala b/src/train-classifier/src/main/scala/TrainClassifier.scala new file mode 100644 index 0000000000..509cb1875c --- /dev/null +++ b/src/train-classifier/src/main/scala/TrainClassifier.scala @@ -0,0 +1,367 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.UUID + +import com.microsoft.ml.spark.schema._ +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.EstimatorParam +import org.apache.spark.ml.util._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Trains a classification model. + */ +class TrainClassifier(override val uid: String) extends Estimator[TrainedClassifierModel] + with HasLabelCol with MMLParams { + + def this() = this(Identifiable.randomUID("TrainClassifier")) + + val model = new EstimatorParam(this, "model", "Classifier to run") + + def getModel: Estimator[_ <: Model[_]] = $(model) + /** @group setParam **/ + def setModel(value: Estimator[_ <: Model[_]]): this.type = set(model, value) + + val featuresColumn = this.uid + "_features" + + val numFeatures = IntParam(this, "numFeatures", "number of features to hash to", 0) + def getNumFeatures: Int = $(numFeatures) + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + val indexLabel = BooleanParam(this, "indexLabel", "index the label column", true) + def getIndexLabel: Boolean = $(indexLabel) + def setIndexLabel(value: Boolean): this.type = set(indexLabel, value) + + /** + * Fits the classification model. + * + * @param dataset The input dataset to train. + * @return The trained classification model. 
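+ * @example Illustrative sketch; the DataFrame `training` and its "Label" column are hypothetical:
+ * {{{
+ * val trained = new TrainClassifier()
+ *   .setModel(new LogisticRegression())
+ *   .setLabelCol("Label")
+ *   .fit(training)
+ * }}}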
+ */ + override def fit(dataset: Dataset[_]): TrainedClassifierModel = { + val labelColumn = getLabelCol + val indexLabelFeaturize = getIndexLabel + var levels: Option[Array[_]] = None + var oneHotEncodeCategoricals = true + var modifyInputLayer = false + + // Convert label column to categorical on train, remove rows with missing labels + val convertedLabelDataset = if (indexLabelFeaturize) { + val dataframe = dataset.toDF().na.drop(Seq(labelColumn)) + if (!SparkSchema.isCategorical(dataframe, labelColumn)) { + val categoricalLabelDataset = SparkSchema.makeCategorical(dataframe, labelColumn, labelColumn, true) + levels = CategoricalUtilities.getLevels(categoricalLabelDataset.schema, labelColumn) + categoricalLabelDataset.withColumn(labelColumn, + categoricalLabelDataset(labelColumn).cast(DoubleType).as(labelColumn, + categoricalLabelDataset.schema(labelColumn).metadata)) + } else { + levels = CategoricalUtilities.getLevels(dataframe.schema, labelColumn) + dataframe + } + } else { + dataset.na.drop(Seq(labelColumn)) + } + + // Create trainer based on the pipeline stage and set the parameters + val numFeatures: Int = getModel match { + case _: DecisionTreeClassifier | _: GBTClassifier | _: RandomForestClassifier => + oneHotEncodeCategoricals = false + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _: MultilayerPerceptronClassifier => + modifyInputLayer = true + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _ => + FeaturizeUtilities.numFeaturesDefault + } + + var classifier: Estimator[_ <: PipelineStage] = getModel match { + case logisticRegressionClassifier: LogisticRegression => { + if (indexLabelFeaturize && levels.isDefined && levels.get.length > 2) { + new OneVsRest() + .setClassifier( + logisticRegressionClassifier + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn)) + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn) + } else { + logisticRegressionClassifier + } + } + case gradientBoostedTreesClassifier: GBTClassifier => { + if (indexLabelFeaturize && levels.isDefined && levels.get.length > 2) { + throw new Exception("Multiclass Gradient Boosted Tree Classifier not supported yet") + } else { + gradientBoostedTreesClassifier + } + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + default + } + case _ => throw new Exception("Unsupported learner type " + getModel.getClass.toString) + } + + classifier = classifier match { + case predictor: Predictor[_, _, _] => { + predictor + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn).asInstanceOf[Estimator[_ <: PipelineStage]] + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + // assume label col and features col already set + default + } + } + + val featuresToHashTo = + if (getNumFeatures != 0) { + getNumFeatures + } else { + numFeatures + } + + val featureColumns = convertedLabelDataset.columns.filter(col => col != labelColumn).toSeq + + val featurizer = new Featurize() + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncodeCategoricals) + .setNumberOfFeatures(featuresToHashTo) + val featurizedModel = featurizer.fit(convertedLabelDataset) + val processedData = featurizedModel.transform(convertedLabelDataset) + + processedData.cache() + + // For neural network, need to modify input layer so it will automatically work during train + if (modifyInputLayer) { + val multilayerPerceptronClassifier = classifier.asInstanceOf[MultilayerPerceptronClassifier] + val row = 
processedData.take(1)(0) + val featuresVector = row.get(row.fieldIndex(featuresColumn)) + val vectorSize = featuresVector.asInstanceOf[org.apache.spark.ml.linalg.Vector].size + multilayerPerceptronClassifier.getLayers(0) = vectorSize + multilayerPerceptronClassifier.setLayers(multilayerPerceptronClassifier.getLayers) + } + + // Train the learner + val fitModel = classifier.fit(processedData) + + processedData.unpersist() + + // Note: The fit shouldn't do anything here + val pipelineModel = new Pipeline().setStages(Array(featurizedModel, fitModel)).fit(convertedLabelDataset) + new TrainedClassifierModel(uid, labelColumn, pipelineModel, levels, featuresColumn) + } + + override def copy(extra: ParamMap): Estimator[TrainedClassifierModel] = defaultCopy(extra) + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = { + val hasScoreCols = + $(model) match { + case _: GBTClassifier => false + case _: MultilayerPerceptronClassifier => false + case _ => true + } + TrainClassifier.validateTransformSchema(hasScoreCols, schema) + } +} + +object TrainClassifier extends DefaultParamsReadable[TrainClassifier] { + def validateTransformSchema(hasScoreCols: Boolean, schema: StructType): StructType = { + val scoresSchema = + if (hasScoreCols) { + StructType(schema.fields :+ StructField(SchemaConstants.ScoresColumn, DoubleType)) + } else schema + val probSchema = + if (hasScoreCols) { + StructType(scoresSchema.fields :+ StructField(SchemaConstants.ScoredProbabilitiesColumn, DoubleType)) + } else scoresSchema + StructType(probSchema.fields :+ StructField(SchemaConstants.ScoredLabelsColumn, DoubleType)) + } +} + +/** + * Model produced by [[TrainClassifier]]. + */ +class TrainedClassifierModel(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val levels: Option[Array[_]], + val featuresColumn: String) + extends Model[TrainedClassifierModel] with MLWritable { + + override def write: MLWriter = new TrainedClassifierModel.TrainClassifierModelWriter(uid, + labelColumn, + model, + levels, + featuresColumn) + + override def copy(extra: ParamMap): TrainedClassifierModel = + new TrainedClassifierModel(uid, + labelColumn, + model.copy(extra), + levels, + featuresColumn) + + override def transform(dataset: Dataset[_]): DataFrame = { + val hasScoreCols = hasScoreColumns(model.stages.last) + + // re-featurize and score the data + val scoredData = model.transform(dataset) + + // Drop the vectorized features column + val cleanedScoredData = scoredData.drop(featuresColumn) + + // Update the schema - TODO: create method that would generate GUID and add it to the scored model + val moduleName = SchemaConstants.ScoreModelPrefix + UUID.randomUUID().toString + val labelColumnExists = cleanedScoredData.columns.contains(labelColumn) + val schematizedScoredDataWithLabel = + if (labelColumnExists) { + SparkSchema.setLabelColumnName(cleanedScoredData, moduleName, labelColumn, SchemaConstants.ClassificationKind) + } else { + cleanedScoredData + } + + // Note: The GBT model does not have scores, only scored labels. Same for OneVsRest with any binary model. 
+ val schematizedScoredDataWithScores = + if (hasScoreCols) { + setMetadataForColumnName(SparkSchema.setScoredProbabilitiesColumnName, + SchemaConstants.SparkProbabilityColumn, + SchemaConstants.ScoredProbabilitiesColumn, + moduleName, + setMetadataForColumnName(SparkSchema.setScoresColumnName, + SchemaConstants.SparkRawPredictionColumn, + SchemaConstants.ScoresColumn, + moduleName, + schematizedScoredDataWithLabel)) + } else schematizedScoredDataWithLabel + + val scoredDataWithUpdatedScoredLabels = + setMetadataForColumnName(SparkSchema.setScoredLabelsColumnName, + SchemaConstants.SparkPredictionColumn, + SchemaConstants.ScoredLabelsColumn, + moduleName, + schematizedScoredDataWithScores) + + val scoredDataWithUpdatedScoredLevels = + if (levels.isEmpty) scoredDataWithUpdatedScoredLabels + else CategoricalUtilities.setLevels(scoredDataWithUpdatedScoredLabels, + SchemaConstants.ScoredLabelsColumn, + levels.get) + + // add metadata to the scored labels and true labels for the levels in label column + if (levels.isEmpty || !labelColumnExists) scoredDataWithUpdatedScoredLevels + else CategoricalUtilities.setLevels(scoredDataWithUpdatedScoredLevels, + labelColumn, + levels.get) + } + + private def setMetadataForColumnName(setter: (DataFrame, String, String, String) => DataFrame, + sparkColumnName: String, + mmlColumnName: String, + moduleName: String, + dataset: DataFrame): DataFrame = { + if (dataset.columns.contains(sparkColumnName)) { + setter(dataset.withColumnRenamed(sparkColumnName, mmlColumnName), + moduleName, + mmlColumnName, + SchemaConstants.ClassificationKind) + } else { + dataset + } + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + TrainClassifier.validateTransformSchema(hasScoreColumns(model.stages.last), schema) + + def hasScoreColumns(model: Transformer): Boolean = { + model match { + case _: GBTClassificationModel => false + case _: MultilayerPerceptronClassificationModel => false + case _ => true + } + } + + def getParamMap: ParamMap = model.stages.last.extractParamMap() +} + +object TrainedClassifierModel extends MLReadable[TrainedClassifierModel] { + + private val featurizeModelPart = "featurizeModel" + private val modelPart = "model" + private val levelsPart = "levels" + private val dataPart = "data" + + override def read: MLReader[TrainedClassifierModel] = new TrainedClassifierModelReader + + override def load(path: String): TrainedClassifierModel = super.load(path) + + /** [[MLWriter]] instance for [[TrainedClassifierModel]] */ + private[TrainedClassifierModel] + class TrainClassifierModelWriter(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val levels: Option[Array[_]], + val featuresColumn: String) + extends MLWriter { + private case class Data(uid: String, labelColumn: String, featuresColumn: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TrainedClassifierModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + // save the model + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(new Path(qualPath, modelPart).toString) + + // save the levels + ObjectUtilities.writeObject(levels, qualPath, levelsPart, sc, overwrite) + + // save model data + val data = Data(uid, labelColumn, 
featuresColumn) + val dataPath = new Path(qualPath, dataPart).toString + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TrainedClassifierModelReader + extends MLReader[TrainedClassifierModel] { + + override def load(path: String): TrainedClassifierModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, labelColumn: String, featuresColumn: String) = + data.select("uid", "labelColumn", "featuresColumn").head() + + // retrieve the underlying model + val model = PipelineModel.load(new Path(qualPath, modelPart).toString) + + // get the levels + val levels = ObjectUtilities.loadObject[Option[Array[_]]](qualPath, levelsPart, sc) + + new TrainedClassifierModel(uid, labelColumn, model, levels, featuresColumn) + } + } + +} diff --git a/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala b/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala new file mode 100644 index 0000000000..5a8674b3b4 --- /dev/null +++ b/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala @@ -0,0 +1,560 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import FileUtilities._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} + +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.StructType + +object ClassifierTestUtils { + + def classificationTrainFile(name: String): File = + new File(s"${sys.env("DATASETS_HOME")}/Binary/Train", name) + + def multiclassClassificationTrainFile(name: String): File = + new File(s"${sys.env("DATASETS_HOME")}/Multiclass/Train", name) + +} + +/** + * Tests to validate the functionality of Train Classifier module. + */ +class VerifyTrainClassifier extends EstimatorFuzzingTest { + + val thisDirectory = new File("src/test/scala") + val targetDirectory = new File("target") + assert(thisDirectory.isDirectory, "-- the test should run in the sub-project root level") + val historicMetricsFile = new File(thisDirectory, "benchmarkMetrics.csv") + val benchmarkMetricsFile = new File(targetDirectory, s"newMetrics_${System.currentTimeMillis}_.csv") + + val LogisticRegressionClassifierName = "LogisticRegression" + val DecisionTreeClassifierName = "DecisionTreeClassification" + val RandomForestClassifierName = "RandomForestClassification" + val GradientBoostedTreesClassifierName = "GradientBoostedTreesClassification" + val NaiveBayesClassifierName = "NaiveBayesClassifier" + val MultilayerPerceptronClassifierName = "MultilayerPerceptronClassifier" + + val accuracyResults = ArrayBuffer.empty[String] + def addAccuracyResult(items: Any*): Unit = { + val line = items.map(_.toString).mkString(",") + println(s"... 
$line") + accuracyResults += line + () + } + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test for training on a classifier") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, dataset, logisticRegressor) + } + + test("Verify you can score on a dataset without a label column") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = logisticRegressor.fit(trainData) + + model.transform(testData.drop(mockLabelColumn)) + } + + test("Verify train classifier works on a dataset with categorical columns") { + val cat = "Cat" + val dog = "Dog" + val bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, dog, cat), + (1, 3, 0.40, 0.50, cat, dog), + (0, 4, 0.78, 0.99, dog, bird), + (1, 5, 0.12, 0.34, cat, dog), + (0, 1, 0.50, 0.60, dog, bird), + (1, 3, 0.40, 0.50, bird, dog), + (0, 3, 0.78, 0.99, dog, cat), + (1, 4, 0.12, 0.34, cat, dog), + (0, 0, 0.50, 0.60, dog, cat), + (1, 2, 0.40, 0.50, bird, dog), + (0, 3, 0.78, 0.99, dog, bird), + (1, 4, 0.12, 0.34, cat, dog))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val catDataset = SparkSchema.makeCategorical( + SparkSchema.makeCategorical(dataset, "col4", "col4", false), + "col5", + "col5", + false) + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, catDataset, logisticRegressor) + + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, catDataset, randomForestClassifier) + } + + test("Verify a trained classifier model can be saved and loaded") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + val model = logisticRegressor.fit(dataset) + + val myModelName = "testModel" + lazy val dir = new File(myModelName) + try { + model.write.overwrite().save(myModelName) + // write a second time with overwrite flag, verify still works + model.write.overwrite().save(myModelName) + // assert directory exists + assert(dir.exists()) + + // load the model + val loadedModel = TrainedClassifierModel.load(myModelName) + + // verify model data loaded + assert(loadedModel.labelColumn == model.labelColumn) + assert(loadedModel.uid == model.uid) + val transformedDataset = loadedModel.transform(dataset) + val benchmarkDataset = model.transform(dataset) + assert(verifyResult(transformedDataset, benchmarkDataset)) + } finally { + // delete the file to cleanup + FileUtilities.delTree(dir) + () + } + } + + test("Verify you can train on a dataset that contains a vector column") { + val dataset: 
DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0, Vectors.dense(1.0, 0.1, -1.5)), + (1, 3, 0.40, 0.50, 1, Vectors.dense(1.5, 0.2, -1.2)), + (0, 4, 0.78, 0.99, 2, Vectors.dense(1.3, 0.3, -1.1)), + (1, 5, 0.12, 0.34, 3, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0)))), + (0, 1, 0.50, 0.60, 0, Vectors.dense(1.0, 0.4, -1.23)), + (1, 3, 0.40, 0.50, 1, Vectors.dense(1.1, 0.5, -1.024)), + (0, 3, 0.78, 0.99, 2, Vectors.dense(1.0, 0.1, -1.22)), + (1, 4, 0.12, 0.34, 3, Vectors.dense(Double.NaN, 0.2, -1.23)), + (0, 0, 0.50, 0.60, 0, Vectors.dense(0.5, 0.3, 1.0)), + (1, 2, 0.40, 0.50, 1, Vectors.dense(1.0, 0.4, -1.2)), + (0, 3, 0.78, 0.99, 2, Vectors.dense(0.7, 0.5, -1.1)), + (1, 4, 0.12, 0.34, 3, Vectors.dense(1.8, 0.1, 2.02)))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, dataset, logisticRegressor) + } + + verifyLearnerOnMulticlassCsvFile("abalone.csv", "Rings", 2, true) + // Has multiple columns with the same name. Spark doesn't seem to be able to handle that yet. + // verifyLearnerOnMulticlassCsvFile("arrhythmia.csv", "Arrhythmia") + verifyLearnerOnMulticlassCsvFile("BreastTissue.csv", "Class", 2, true) + verifyLearnerOnMulticlassCsvFile("CarEvaluation.csv", "Col7", 2, true) + // Getting "code generation" exceeded max size limit error + // verifyLearnerOnMulticlassCsvFile("mnist.train.csv", "Label") + // This works with 2.0.0, but on 2.1.0 it looks like it loops infinitely while leaking memory + // verifyLearnerOnMulticlassCsvFile("au3_25000.csv", "class", 2, true) + // This takes way too long for a gated build. Need to make it something like a p3 test case. + // verifyLearnerOnMulticlassCsvFile("Seattle911.train.csv", "Event Clearance Group") + + verifyLearnerOnBinaryCsvFile("PimaIndian.csv", "Diabetes mellitus", 2, true) + verifyLearnerOnBinaryCsvFile("data_banknote_authentication.csv", "class", 2, false) + verifyLearnerOnBinaryCsvFile("task.train.csv", "TaskFailed10", 2, true) + verifyLearnerOnBinaryCsvFile("breast-cancer.train.csv", "Label", 2, true) + verifyLearnerOnBinaryCsvFile("random.forest.train.csv", "#Malignant", 2, true) + verifyLearnerOnBinaryCsvFile("transfusion.csv", "Donated", 2, true) + // verifyLearnerOnBinaryCsvFile("au2_10000.csv", "class", 1) + verifyLearnerOnBinaryCsvFile("breast-cancer-wisconsin.csv", "Class", 2, true) + verifyLearnerOnBinaryCsvFile("fertility_Diagnosis.train.csv", "Diagnosis", 2, false) + verifyLearnerOnBinaryCsvFile("bank.train.csv", "y", 2, false) + verifyLearnerOnBinaryCsvFile("TelescopeData.csv", " Class", 2, false) + + test("Compare benchmark results file to generated file", TestBase.Extended){ + try writeFile(benchmarkMetricsFile, accuracyResults.mkString("\n") + "\n") + catch { + case e: java.io.IOException => throw new Exception("Not able to process benchmarks file") + } + val historicMetrics = readFile(historicMetricsFile, _.getLines.toList) + if (historicMetrics.length != accuracyResults.length) + throw new Exception(s"Mis-matching number of lines in new benchmarks file: $benchmarkMetricsFile") + for (((hist,acc),i) <- (historicMetrics zip accuracyResults).zipWithIndex) { + assert(hist == acc, + s"""Lines do not match on file comparison: + | $historicMetricsFile:$i: + | $hist + | $benchmarkMetricsFile:$i: + | $acc + |.""".stripMargin) + } + } + + def verifyLearnerOnBinaryCsvFile(fileName: String, + labelColumnName: String, + decimals: Int, + 
includeNaiveBayes: Boolean): Unit = { + test("Verify classifier can be trained and scored on " + fileName, TestBase.Extended) { + val fileLocation = ClassifierTestUtils.classificationTrainFile(fileName).toString + val (trainScoreResultLogisticRegression: DataFrame, + trainScoreResultDecisionTree: DataFrame, + trainScoreResultGradientBoostedTrees: Option[DataFrame], + trainScoreResultRandomForest: DataFrame, + trainScoreResultMultilayerPerceptron: Option[DataFrame], + trainScoreResultNaiveBayes: Option[DataFrame]) = + readAndScoreDataset(fileName, labelColumnName, fileLocation, true, includeNaiveBayes) + + // Evaluate and get auc, round to 2 decimals + val (aucLogisticRegression, prLogisticRegression) = + evalAUC(trainScoreResultLogisticRegression, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucDecisionTree, prDecisionTree) = + evalAUC(trainScoreResultDecisionTree, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucGradientBoostedTrees, prGradientBoostedTrees) = + evalAUC(trainScoreResultGradientBoostedTrees.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + val (aucRandomForest, prRandomForest) = + evalAUC(trainScoreResultRandomForest, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucMultilayerPerceptron, prMultilayerPerceptron) = + evalAUC(trainScoreResultMultilayerPerceptron.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + addAccuracyResult(fileName, LogisticRegressionClassifierName, + aucLogisticRegression, prLogisticRegression) + addAccuracyResult(fileName, DecisionTreeClassifierName, + aucDecisionTree, prDecisionTree) + addAccuracyResult(fileName, GradientBoostedTreesClassifierName, + aucGradientBoostedTrees, prGradientBoostedTrees) + addAccuracyResult(fileName, RandomForestClassifierName, + aucRandomForest, prRandomForest) + addAccuracyResult(fileName, MultilayerPerceptronClassifierName, + aucMultilayerPerceptron, prMultilayerPerceptron) + if (includeNaiveBayes) { + val (aucNaiveBayes, prNaiveBayes) = + evalAUC(trainScoreResultNaiveBayes.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + addAccuracyResult(fileName, NaiveBayesClassifierName, + aucNaiveBayes, prNaiveBayes) + } + } + } + + def verifyLearnerOnMulticlassCsvFile(fileName: String, + labelColumnName: String, + decimals: Int, + includeNaiveBayes: Boolean): Unit = { + test("Verify classifier can be trained and scored on multiclass " + fileName, TestBase.Extended) { + val fileLocation = ClassifierTestUtils.multiclassClassificationTrainFile(fileName).toString + val (trainScoreResultLogisticRegression: DataFrame, + trainScoreResultDecisionTree: DataFrame, + trainScoreResultGradientBoostedTrees: Option[DataFrame], + trainScoreResultRandomForest: DataFrame, + trainScoreResultMultilayerPerceptron: Option[DataFrame], + trainScoreResultNaiveBayes: Option[DataFrame]) = + readAndScoreDataset(fileName, labelColumnName, fileLocation, false, includeNaiveBayes) + + // Evaluate and get accuracy, F1-Measure + val (accuracyLogisticRegression, f1LogisticRegression) = + evalMulticlass(trainScoreResultLogisticRegression, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + val (accuracyDecisionTree, f1DecisionTree) = + evalMulticlass(trainScoreResultDecisionTree, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + val (accuracyRandomForest, f1RandomForest) = + evalMulticlass(trainScoreResultRandomForest, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + 
addAccuracyResult(fileName, LogisticRegressionClassifierName, + accuracyLogisticRegression, f1LogisticRegression) + + addAccuracyResult(fileName, DecisionTreeClassifierName, + accuracyDecisionTree, f1DecisionTree) + + addAccuracyResult(fileName, RandomForestClassifierName, + accuracyRandomForest, f1RandomForest) + + if (includeNaiveBayes) { + val (accuracyNaiveBayes, f1NaiveBayes) = + evalMulticlass(trainScoreResultNaiveBayes.get, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + addAccuracyResult(fileName, NaiveBayesClassifierName, + accuracyNaiveBayes, f1NaiveBayes) + } + } + } + + def readAndScoreDataset(fileName: String, + labelColumnName: String, + fileLocation: String, + includeNonProb: Boolean, + includeNaiveBayes: Boolean) + : (DataFrame, DataFrame, Option[DataFrame], DataFrame, Option[DataFrame], Option[DataFrame]) = { + // TODO: Add other file types for testing + val dataset: DataFrame = + session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("treatEmptyValuesAsNulls", "false") + .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t") + .load(fileLocation) + val logisticRegressor = + TrainClassifierTestUtilities.createLogisticRegressor(labelColumnName) + + val decisionTreeClassifier = + TrainClassifierTestUtilities.createDecisionTreeClassifier(labelColumnName) + + val gradientBoostedTreesClassifier = + TrainClassifierTestUtilities.createGradientBoostedTreesClassifier(labelColumnName) + + val randomForestClassifier = + TrainClassifierTestUtilities.createRandomForestClassifier(labelColumnName) + + val multilayerPerceptronClassifier = + TrainClassifierTestUtilities.createMultilayerPerceptronClassifier(labelColumnName) + + val naiveBayesClassifier = + TrainClassifierTestUtilities.createNaiveBayesClassifier(labelColumnName) + + val trainScoreResultLogisticRegression = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, logisticRegressor) + + val trainScoreResultDecisionTree = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, decisionTreeClassifier) + + val trainScoreResultGradientBoostedTrees = + if (includeNonProb) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, gradientBoostedTreesClassifier)) + } + else None + + val trainScoreResultMultilayerPerceptron = + if (includeNonProb) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, multilayerPerceptronClassifier)) + } + else None + + val trainScoreResultNaiveBayes = + if (includeNaiveBayes) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, naiveBayesClassifier)) + } + else None + + val trainScoreResultRandomForest = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, randomForestClassifier) + (trainScoreResultLogisticRegression, trainScoreResultDecisionTree, + trainScoreResultGradientBoostedTrees, trainScoreResultRandomForest, + trainScoreResultMultilayerPerceptron, trainScoreResultNaiveBayes) + } + + /** + * Get the auc and area over PR for the scored dataset. + * + * @param scoredDataset The scored dataset to evaluate. + * @param labelColumn The label column. + * @param predictionColumn The prediction column. + * @return The AUC for the scored dataset. 
+ */ + def evalAUC(scoredDataset: DataFrame, + labelColumn: String, + predictionColumn: String, + decimals: Int): (Double, Double) = { + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(scoredDataset.schema, labelColumn) + if (levels.isEmpty) throw new Exception("Test unexpectedly received empty levels") + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + val scoreAndLabels = + scoredDataset.select(col(predictionColumn), col(labelColumn)).na.drop().rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + } + val metrics = new BinaryClassificationMetrics(scoreAndLabels) + val result = (round(metrics.areaUnderROC(), decimals), + round(metrics.areaUnderPR(), decimals)) + metrics.unpersist() + result + } + + /** + * Get the accuracy and f1-score from multiclass data. + * + * @param scoredDataset The scored dataset to evaluate. + * @param labelColumn The label column. + * @param predictionColumn The prediction column. + * @return The AUC for the scored dataset. + */ + def evalMulticlass(scoredDataset: DataFrame, + labelColumn: String, + predictionColumn: String, + decimals: Int): (Double, Double) = { + + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(scoredDataset.schema, labelColumn) + if (levels.isEmpty) throw new Exception("Test unexpectedly received empty levels") + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + val scoreAndLabels = + scoredDataset.select(col(predictionColumn), col(labelColumn)).na.drop().rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + } + val metrics = new MulticlassMetrics(scoreAndLabels) + val result = (round(metrics.accuracy, decimals), + round(metrics.weightedFMeasure, decimals)) + result + } + + /** + * Rounds the given metric to 2 decimals. + * @param metric The metric to round. + * @return The rounded metric. + */ + def round(metric: Double, decimals: Int): Double = { + BigDecimal(metric) + .setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TrainClassifier].setModel(new LogisticRegression()).setLabelCol(mockLabelColumn) + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new TrainClassifier() +} + +/** + * Test helper methods for Train Classifier module. 
+ */ +object TrainClassifierTestUtilities { + + def createLogisticRegressor(labelColumn: String): Estimator[TrainedClassifierModel] = { + val logisticRegression = new LogisticRegression() + .setRegParam(0.3) + .setElasticNetParam(0.8) + .setMaxIter(10) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(logisticRegression) + .set(trainClassifier.labelCol, labelColumn) + } + + def createDecisionTreeClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new DecisionTreeClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createGradientBoostedTreesClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new GBTClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMaxIter(20) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setStepSize(0.1) + .setSubsamplingRate(1.0) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createRandomForestClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new RandomForestClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setNumTrees(20) + .setSubsamplingRate(1.0) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createMultilayerPerceptronClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val layers = Array[Int](2, 5, 2) + val multilayerPerceptronClassifier = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(1) + .setMaxIter(1) + .setTol(1e-6) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(multilayerPerceptronClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createNaiveBayesClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val naiveBayesClassifier = new NaiveBayes() + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(naiveBayesClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def trainScoreDataset(labelColumn: String, dataset: DataFrame, + trainClassifier: Estimator[TrainedClassifierModel]): DataFrame = { + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = trainClassifier.fit(trainData) + + val scoredData = model.transform(testData) + scoredData + } + +} diff --git a/src/train-classifier/src/test/scala/benchmarkMetrics.csv b/src/train-classifier/src/test/scala/benchmarkMetrics.csv new file mode 100644 index 0000000000..0144c21824 --- /dev/null +++ b/src/train-classifier/src/test/scala/benchmarkMetrics.csv @@ -0,0 +1,68 @@ +abalone.csv,LogisticRegression,0.15,0.04 +abalone.csv,DecisionTreeClassification,0.25,0.22 +abalone.csv,RandomForestClassification,0.26,0.22 +abalone.csv,NaiveBayesClassifier,0.21,0.15 +BreastTissue.csv,LogisticRegression,0.43,0.29 +BreastTissue.csv,DecisionTreeClassification,0.59,0.58 +BreastTissue.csv,RandomForestClassification,0.57,0.52 +BreastTissue.csv,NaiveBayesClassifier,0.54,0.5 
+CarEvaluation.csv,LogisticRegression,0.7,0.58 +CarEvaluation.csv,DecisionTreeClassification,0.76,0.74 +CarEvaluation.csv,RandomForestClassification,0.76,0.7 +CarEvaluation.csv,NaiveBayesClassifier,0.74,0.69 +PimaIndian.csv,LogisticRegression,0.5,0.68 +PimaIndian.csv,DecisionTreeClassification,0.62,0.56 +PimaIndian.csv,GradientBoostedTreesClassification,0.68,0.68 +PimaIndian.csv,RandomForestClassification,0.83,0.72 +PimaIndian.csv,MultilayerPerceptronClassifier,0.5,0.68 +PimaIndian.csv,NaiveBayesClassifier,0.51,0.5 +data_banknote_authentication.csv,LogisticRegression,0.92,0.89 +data_banknote_authentication.csv,DecisionTreeClassification,0.98,0.97 +data_banknote_authentication.csv,GradientBoostedTreesClassification,0.98,0.98 +data_banknote_authentication.csv,RandomForestClassification,1.0,1.0 +data_banknote_authentication.csv,MultilayerPerceptronClassifier,0.7,0.74 +task.train.csv,LogisticRegression,0.5,0.57 +task.train.csv,DecisionTreeClassification,0.74,0.71 +task.train.csv,GradientBoostedTreesClassification,0.83,0.85 +task.train.csv,RandomForestClassification,0.9,0.8 +task.train.csv,MultilayerPerceptronClassifier,0.5,0.57 +task.train.csv,NaiveBayesClassifier,0.71,0.56 +breast-cancer.train.csv,LogisticRegression,0.99,0.98 +breast-cancer.train.csv,DecisionTreeClassification,0.96,0.96 +breast-cancer.train.csv,GradientBoostedTreesClassification,0.94,0.94 +breast-cancer.train.csv,RandomForestClassification,1.0,0.99 +breast-cancer.train.csv,MultilayerPerceptronClassifier,0.7,0.71 +breast-cancer.train.csv,NaiveBayesClassifier,0.96,0.96 +random.forest.train.csv,LogisticRegression,1.0,0.99 +random.forest.train.csv,DecisionTreeClassification,0.96,0.96 +random.forest.train.csv,GradientBoostedTreesClassification,0.95,0.95 +random.forest.train.csv,RandomForestClassification,0.99,0.99 +random.forest.train.csv,MultilayerPerceptronClassifier,0.62,0.67 +random.forest.train.csv,NaiveBayesClassifier,0.91,0.91 +transfusion.csv,LogisticRegression,0.5,0.62 +transfusion.csv,DecisionTreeClassification,0.68,0.51 +transfusion.csv,GradientBoostedTreesClassification,0.64,0.52 +transfusion.csv,RandomForestClassification,0.77,0.51 +transfusion.csv,MultilayerPerceptronClassifier,0.5,0.62 +transfusion.csv,NaiveBayesClassifier,0.71,0.61 +breast-cancer-wisconsin.csv,LogisticRegression,1.0,1.0 +breast-cancer-wisconsin.csv,DecisionTreeClassification,0.94,0.95 +breast-cancer-wisconsin.csv,GradientBoostedTreesClassification,0.93,0.95 +breast-cancer-wisconsin.csv,RandomForestClassification,1.0,0.99 +breast-cancer-wisconsin.csv,MultilayerPerceptronClassifier,0.5,0.66 +breast-cancer-wisconsin.csv,NaiveBayesClassifier,0.96,0.95 +fertility_Diagnosis.train.csv,LogisticRegression,0.5,0.56 +fertility_Diagnosis.train.csv,DecisionTreeClassification,0.65,0.18 +fertility_Diagnosis.train.csv,GradientBoostedTreesClassification,0.58,0.29 +fertility_Diagnosis.train.csv,RandomForestClassification,0.68,0.39 +fertility_Diagnosis.train.csv,MultilayerPerceptronClassifier,0.5,0.56 +bank.train.csv,LogisticRegression,0.5,0.56 +bank.train.csv,DecisionTreeClassification,0.53,0.25 +bank.train.csv,GradientBoostedTreesClassification,0.66,0.49 +bank.train.csv,RandomForestClassification,0.88,0.49 +bank.train.csv,MultilayerPerceptronClassifier,0.5,0.06 +TelescopeData.csv,LogisticRegression,0.5,0.68 +TelescopeData.csv,DecisionTreeClassification,0.62,0.58 +TelescopeData.csv,GradientBoostedTreesClassification,0.82,0.83 +TelescopeData.csv,RandomForestClassification,0.89,0.86 +TelescopeData.csv,MultilayerPerceptronClassifier,0.56,0.53 diff --git 
a/src/train-regressor/build.sbt b/src/train-regressor/build.sbt new file mode 100644 index 0000000000..5031f836f0 --- /dev/null +++ b/src/train-regressor/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: featurize diff --git a/src/train-regressor/src/main/scala/TrainRegressor.scala b/src/train-regressor/src/main/scala/TrainRegressor.scala new file mode 100644 index 0000000000..e94d921407 --- /dev/null +++ b/src/train-regressor/src/main/scala/TrainRegressor.scala @@ -0,0 +1,246 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.UUID +import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema} +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.param._ +import org.apache.spark.ml.regression._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +/** + * Trains a regression model. + */ +class TrainRegressor(override val uid: String) extends Estimator[TrainedRegressorModel] + with HasLabelCol with MMLParams { + + def this() = this(Identifiable.randomUID("TrainRegressor")) + + val model = new EstimatorParam(this, "model", "Regressor to run") + + def getModel: Estimator[_ <: Model[_]] = $(model) + /** @group setParam **/ + def setModel(value: Estimator[_ <: Model[_]]): this.type = set(model, value) + + val featuresColumn = this.uid + "_features" + + val numFeatures = IntParam(this, "numFeatures", "number of features to hash to", 0) + def getNumFeatures: Int = $(numFeatures) + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + /** + * Fits the regression model. + * + * @param dataset The input dataset to train. + * @return The trained regression model. 
+ */ + override def fit(dataset: Dataset[_]): TrainedRegressorModel = { + val labelColumn = getLabelCol + var oneHotEncodeCategoricals = true + + val numFeatures: Int = getModel match { + case _: DecisionTreeRegressor | _: GBTRegressor | _: RandomForestRegressor => + oneHotEncodeCategoricals = false + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _ => + FeaturizeUtilities.numFeaturesDefault + } + + val regressor = getModel match { + case predictor: Predictor[_, _, _] => { + predictor + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn).asInstanceOf[Estimator[_ <: PipelineStage]] + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + // assume label col and features col already set + default + } + case _ => throw new Exception("Unsupported learner type " + getModel.getClass.toString) + } + + val featuresToHashTo = + if (getNumFeatures != 0) { + getNumFeatures + } else { + numFeatures + } + + // TODO: Handle DateType, TimestampType and DecimalType for label + // Convert the label column during train to the correct type and drop missings + val convertedLabelDataset = dataset.withColumn(labelColumn, + dataset.schema(labelColumn).dataType match { + case _: IntegerType | + _: BooleanType | + _: FloatType | + _: ByteType | + _: LongType | + _: ShortType => { + dataset(labelColumn).cast(DoubleType) + } + case _: StringType => { + throw new Exception("Invalid type: Regressors are not able to train on a string label column: " + labelColumn) + } + case _: DoubleType => { + dataset(labelColumn) + } + case default => throw new Exception("Unknown type: " + default.typeName + ", for label column: " + labelColumn) + } + ).na.drop(Seq(labelColumn)) + + val featureColumns = convertedLabelDataset.columns.filter(col => col != labelColumn).toSeq + + val featurizer = new Featurize() + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncodeCategoricals) + .setNumberOfFeatures(featuresToHashTo) + + val featurizedModel = featurizer.fit(convertedLabelDataset) + val processedData = featurizedModel.transform(convertedLabelDataset) + + processedData.cache() + + // Train the learner + val fitModel = regressor.fit(processedData) + + processedData.unpersist() + + // Note: The fit shouldn't do anything here + val pipelineModel = new Pipeline().setStages(Array(featurizedModel, fitModel)).fit(convertedLabelDataset) + new TrainedRegressorModel(uid, labelColumn, pipelineModel, featuresColumn) + } + + override def copy(extra: ParamMap): Estimator[TrainedRegressorModel] = defaultCopy(extra) + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = TrainRegressor.validateTransformSchema(schema) + +} + +object TrainRegressor extends DefaultParamsReadable[TrainRegressor] { + def validateTransformSchema(schema: StructType): StructType = { + StructType(schema.fields :+ StructField(SchemaConstants.ScoresColumn, DoubleType)) + } +} + +/** + * Model produced by [[TrainRegressor]]. 
+ */ +class TrainedRegressorModel(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val featuresColumn: String) + extends Model[TrainedRegressorModel] with MLWritable { + + override def write: MLWriter = new TrainedRegressorModel.TrainedRegressorModelWriter(uid, + labelColumn, + model, + featuresColumn) + + override def copy(extra: ParamMap): TrainedRegressorModel = + new TrainedRegressorModel(uid, + labelColumn, + model.copy(extra), + featuresColumn) + + override def transform(dataset: Dataset[_]): DataFrame = { + // re-featurize and score the data + val scoredData = model.transform(dataset) + + // Drop the vectorized features column + val cleanedScoredData = scoredData.drop(featuresColumn) + + // Update the schema - TODO: create method that would generate GUID and add it to the scored model + val moduleName = SchemaConstants.ScoreModelPrefix + UUID.randomUUID().toString + val labelColumnExists = cleanedScoredData.columns.contains(labelColumn) + val schematizedScoredDataWithLabel = + if (labelColumnExists) { + SparkSchema.setLabelColumnName(cleanedScoredData, moduleName, labelColumn, SchemaConstants.RegressionKind) + } else { + cleanedScoredData + } + + SparkSchema.setScoresColumnName( + schematizedScoredDataWithLabel.withColumnRenamed(SchemaConstants.SparkPredictionColumn, + SchemaConstants.ScoresColumn), + moduleName, + SchemaConstants.ScoresColumn, + SchemaConstants.RegressionKind) + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = TrainRegressor.validateTransformSchema(schema) + + def getParamMap: ParamMap = model.stages.last.extractParamMap() +} + +object TrainedRegressorModel extends MLReadable[TrainedRegressorModel] { + + private val featurizeModelPart = "featurizeModel" + private val modelPart = "model" + private val dataPart = "data" + + override def read: MLReader[TrainedRegressorModel] = new TrainedRegressorModelReader + + override def load(path: String): TrainedRegressorModel = super.load(path) + + /** [[MLWriter]] instance for [[TrainedRegressorModel]] */ + private[TrainedRegressorModel] + class TrainedRegressorModelWriter(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val featuresColumn: String) + extends MLWriter { + private case class Data(uid: String, labelColumn: String, featuresColumn: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TrainedRegressorModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + // save the featurize model and regressor + val modelPath = new Path(qualPath, modelPart).toString + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(modelPath) + + // save model data + val data = Data(uid, labelColumn, featuresColumn) + val dataPath = new Path(qualPath, dataPart).toString + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TrainedRegressorModelReader + extends MLReader[TrainedRegressorModel] { + + override def load(path: String): TrainedRegressorModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, 
dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, labelColumn: String, featuresColumn: String) = + data.select("uid", "labelColumn", "featuresColumn").head() + + // retrieve the underlying model + val model = PipelineModel.load(new Path(qualPath, modelPart).toString) + + new TrainedRegressorModel(uid, labelColumn, model, featuresColumn) + } + } + +} diff --git a/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala b/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala new file mode 100644 index 0000000000..c50b3a84c1 --- /dev/null +++ b/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala @@ -0,0 +1,184 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.regression.{LinearRegression, RandomForestRegressor} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types._ + +/** + * Tests to validate the functionality of Train Regressor module. + */ +class VerifyTrainRegressor extends EstimatorFuzzingTest { + + val regressionTrainFilesDirectory = "/Regression/Train/" + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test for training on a regressor") { + val dataset = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(mockLabelColumn, dataset, linearRegressor) + } + + test("Verify you can score on a dataset without a label column") { + val dataset: DataFrame = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = linearRegressor.fit(trainData) + + model.transform(testData.drop(mockLabelColumn)) + } + + test("Verify train regressor works with different output types") { + val dataset = createMockDataset + val castLabelCol = "cast_" + mockLabelColumn + for (outputType <- + Seq(IntegerType, LongType, ByteType, BooleanType, FloatType, DoubleType, ShortType)) { + val modifiedDataset = dataset.withColumn(castLabelCol, dataset(mockLabelColumn).cast(outputType)) + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(castLabelCol) + TrainRegressorTestUtilities.trainScoreDataset(castLabelCol, modifiedDataset, linearRegressor) + } + } + + test("Verify a trained regression model can be saved") { + val dataset: DataFrame = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + val model = linearRegressor.fit(dataset) + + val myModelName = "testModel" + lazy val dir = new File(myModelName) + try { + model.write.overwrite().save(myModelName) + // write a second time with overwrite flag, verify still works + model.write.overwrite().save(myModelName) + // assert directory exists + assert(dir.exists()) + + // 
load the model + val loadedModel = TrainedRegressorModel.load(myModelName) + + // verify model data loaded + assert(loadedModel.labelColumn == model.labelColumn) + assert(loadedModel.uid == model.uid) + val transformedDataset = loadedModel.transform(dataset) + val benchmarkDataset = model.transform(dataset) + assert(verifyResult(transformedDataset, benchmarkDataset)) + } finally { + // delete the file to cleanup + FileUtilities.delTree(dir) + () + } + } + + test("Verify regressor can be trained and scored on airfoil_self_noise-train-csv") { + val fileLocation = + sys.env("DATASETS_HOME") + regressionTrainFilesDirectory + "airfoil_self_noise.train.csv" + val dataset = session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("delimiter", ",").option("treatEmptyValuesAsNulls", "false") + .load(fileLocation) + + val labelColumn = "Scaled sound pressure level" + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(labelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, dataset, linearRegressor) + } + + test("Verify regressor can be trained and scored on CASP-train-csv") { + val fileLocation = + sys.env("DATASETS_HOME") + regressionTrainFilesDirectory + "CASP.train.csv" + val dataset = session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("delimiter", ",").option("treatEmptyValuesAsNulls", "false") + .load(fileLocation) + + val labelColumn = "RMSD" + + val parameters = TrainRegressorTestUtilities.createRandomForestRegressor(labelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, dataset, parameters) + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TrainRegressor].setModel(new LinearRegression()).setLabelCol(mockLabelColumn) + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new TrainRegressor() +} + +/** + * Test helper methods for Train Regressor module. 
+ */ +object TrainRegressorTestUtilities { + + def createLinearRegressor(labelColumn: String): Estimator[TrainedRegressorModel] = { + val linearRegressor = new LinearRegression() + .setRegParam(0.3) + .setElasticNetParam(0.8) + val trainRegressor = new TrainRegressor() + trainRegressor + .setModel(linearRegressor) + .set(trainRegressor.labelCol, labelColumn) + } + + def createRandomForestRegressor(labelColumn: String): Estimator[TrainedRegressorModel] = { + val linearRegressor = new RandomForestRegressor() + .setFeatureSubsetStrategy("auto") + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setNumTrees(20) + val trainRegressor = new TrainRegressor() + trainRegressor + .setModel(linearRegressor) + .set(trainRegressor.labelCol, labelColumn) + } + + def trainScoreDataset(labelColumn: String, dataset: DataFrame, trainRegressor: Estimator[TrainedRegressorModel]) + : DataFrame = { + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = trainRegressor.fit(trainData) + val scoredData = model.transform(testData) + scoredData + } + +} diff --git a/src/utils/build.sbt b/src/utils/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/utils/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/utils/src/main/scala/JarLoadingUtils.scala b/src/utils/src/main/scala/JarLoadingUtils.scala new file mode 100644 index 0000000000..06553105c5 --- /dev/null +++ b/src/utils/src/main/scala/JarLoadingUtils.scala @@ -0,0 +1,139 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.net.URLClassLoader +import java.util.jar.JarFile + +import FileUtilities._ + +import scala.reflect.ClassTag +import scala.reflect._ +import collection.JavaConverters._ + +/** + * Contains logic for loading classes + */ +object JarLoadingUtils { + + private val jarRelPath = "target/scala-" + sys.env("SCALA_VERSION") + private val testRelPath = "test-classes" + private val projectRoots = "project/project-roots.txt" + + private val outputDirs = { + val topDir = List(".", "..").find(root => new File(root, projectRoots).exists) + if (topDir.isEmpty) { + sys.error(s"Could not find roots file at $projectRoots") + } + val rootsFile = new File(topDir.get, projectRoots) + val roots = readFile(rootsFile, _.getLines.toList) + roots.map { root => + new File(new File(topDir.get, root), jarRelPath) + } + } + + private val testOutputDirs = { + outputDirs.flatMap(dir => { + val filePath = new File(dir, testRelPath) + if (filePath.exists()) { + Some(filePath) + } else { + None + } + }) + } + + private val jarFileLocs = outputDirs.flatMap(dir => + FileUtilities.allFiles(dir, file => file.getName.endsWith(".jar"))) + + private val testFileLocs = testOutputDirs.flatMap(dir => + FileUtilities.allFiles(dir, file => file.getName.endsWith(".class")) + .map(file => file.getCanonicalPath.replace(dir.getCanonicalPath + "/", ""))) + + private val jarURLs = jarFileLocs.map(_.toURI.toURL) + + val classLoader = new URLClassLoader(jarURLs.union(testOutputDirs + .map(file => new File(file.getCanonicalPath).toURI.toURL)).toArray, + this.getClass.getClassLoader) + + private lazy val loadedClasses: List[Class[_]] = { + val jarFiles = jarFileLocs.map(jf => new JarFile(jf.getAbsolutePath)) + try { + val classNames = jarFiles.flatMap(_.entries().asScala) + .filter(je => 
je.getName.endsWith(".class")) + .map(je => je.getName.replace("/", ".").stripSuffix(".class")) + classNames.map(name => classLoader.loadClass(name)) + } finally { + jarFiles.foreach(jf => jf.close()) + } + } + + private lazy val loadedTestClasses: List[Class[_]] = { + val classNames = testFileLocs.map(je => je.stripSuffix(".class").replace("/", ".")) + classNames.map(name => { + try { + classLoader.loadClass(name) + } catch { + case e: Throwable => { println(s"Encountered error $e when loading class"); null } + } + }).filter(_ != null) + } + + private def catchInstantiationErrors[T](clazz: Class[_], func: Function[Class[_], T], debug: Boolean): Option[T] = { + def log(message: String) = { + if (debug) println(message) + } + + try { + Some(func(clazz)) + } catch { + // Classes without default constructor + case ie: InstantiationException => + log(s"Could not generate wrapper without default constructor for " + + s"class ${clazz.getSimpleName}: $ie") + None + // Classes with "private" modifiers on constructors + case iae: IllegalAccessException => + log(s"Could not generate wrapper due to private modifiers or constructors for " + + s"class ${clazz.getSimpleName}: $iae") + None + case ncd: NoClassDefFoundError => + log(s"Could not generate wrapper because no class definition found for class " + + s"${clazz.getSimpleName}: $ncd") + None + case ule: UnsatisfiedLinkError => + log(s"Could not generate wrapper due to link error from: " + + s"${clazz.getSimpleName}: $ule") + None + case e: Exception => + log(s"Could not generate wrapper for class ${clazz.getSimpleName}: ${e.printStackTrace()}") + None + } + } + + def load[T: ClassTag](instantiate: Class[_] => Any, debug: Boolean): List[T] = { + loadedClasses.filter(lc => classTag[T].runtimeClass.isAssignableFrom(lc)).flatMap { lc => + catchInstantiationErrors(lc, instantiate, debug) + }.asInstanceOf[List[T]] + } + + def loadClass[T: ClassTag](debug: Boolean): List[T] = load[T](lc => lc.newInstance(), debug) + + def loadTest[T: ClassTag](instantiate: Class[_] => Any, debug: Boolean): List[T] = { + loadedTestClasses.filter(lc => classTag[T].runtimeClass.isAssignableFrom(lc)).flatMap { lc => + catchInstantiationErrors(lc, instantiate, debug) + }.asInstanceOf[List[T]] + } + + def loadTestClass[T: ClassTag](debug: Boolean): List[T] = loadTest[T](lc => lc.newInstance(), debug) + + def loadObject[T: ClassTag](debug: Boolean): List[T] = load[T]( + lc =>{ + val cons = lc.getDeclaredConstructors()(0) + cons.setAccessible(true) + cons.newInstance()} + , + debug) + +} diff --git a/src/utils/src/main/scala/ObjectUtilities.scala b/src/utils/src/main/scala/ObjectUtilities.scala new file mode 100644 index 0000000000..cc17806321 --- /dev/null +++ b/src/utils/src/main/scala/ObjectUtilities.scala @@ -0,0 +1,71 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
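JarLoadingUtils (above) scans each project's target/scala-$SCALA_VERSION output for jars and compiled test classes, loads them through a dedicated URLClassLoader, and filters the loaded classes by assignability to the requested type, skipping anything that cannot be instantiated. A minimal usage sketch, assuming the build layout it expects (a project/project-roots.txt at the repository root and SCALA_VERSION set in the environment); Transformer is used here only as an example filter type:

    import org.apache.spark.ml.Transformer

    // Instantiate every discovered class with an accessible no-arg constructor
    // that is assignable to Transformer; non-instantiable classes are skipped.
    val transformers: List[Transformer] = JarLoadingUtils.loadClass[Transformer](debug = false)
    transformers.foreach(t => println(t.getClass.getName))

    // Objects (including those with private constructors) can be materialized with
    // loadObject, and test classes compiled under test-classes with loadTestClass.
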
+ +package com.microsoft.ml.spark + +import java.io.{InputStream, ObjectInputStream, ObjectOutputStream, ObjectStreamClass} + +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import FileUtilities._ + +class ObjectInputStreamContextClassLoader(input: InputStream) extends ObjectInputStream(input) { + protected override def resolveClass(desc: ObjectStreamClass): Class[_] = { + try { + Class.forName(desc.getName, false, Thread.currentThread().getContextClassLoader()) + } catch { + case _: ClassNotFoundException => super.resolveClass(desc) + } + } +} + +/** + * Contains logic for reading and writing objects. + */ +object ObjectUtilities { + + /** + * Loads the object from the given path. + * @param corePath The main path for model to load the object from. + * @param objectSubPath The path to the object. + * @param sc The current spark context. + * @tparam ObjectType The type of the object to load. + * @return The loaded object. + */ + def loadObject[ObjectType](corePath: Path, objectSubPath: String, sc: SparkContext): ObjectType = { + val hadoopConf = sc.hadoopConfiguration + val inputPath = new Path(corePath, objectSubPath) + using(Seq(inputPath.getFileSystem(hadoopConf))) { fs => + val inputStream = fs(0).open(inputPath) + using(Seq(new ObjectInputStreamContextClassLoader(inputStream))) { + objectStream => + objectStream(0).readObject().asInstanceOf[ObjectType] + }.get + }.get + } + + /** + * Writes the object to the given path. + * @param objToWrite The object to write. + * @param corePath The main path for model to write the object to. + * @param objectSubPath The path to the object. + * @param sc The current spark context. + * @tparam ObjectType The type of the object to load. + */ + def writeObject[ObjectType](objToWrite: ObjectType, + corePath: Path, + objectSubPath: String, + sc: SparkContext, + overwrite: Boolean): Unit = { + val hadoopConf = sc.hadoopConfiguration + val outputPath = new Path(corePath, objectSubPath) + using(Seq(outputPath.getFileSystem(hadoopConf))) { fs => + val outputStream = fs(0).create(outputPath, overwrite) + using(Seq(new ObjectOutputStream(outputStream))) { + objectStream => + objectStream(0).writeObject(objToWrite) + }.get + }.get + } + +} diff --git a/src/utils/src/main/scala/PipelineUtilities.scala b/src/utils/src/main/scala/PipelineUtilities.scala new file mode 100644 index 0000000000..557e59b5bf --- /dev/null +++ b/src/utils/src/main/scala/PipelineUtilities.scala @@ -0,0 +1,55 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.FileUtilities.File +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +/** + * Exposes utilities used for saving and loading pipelines. + */ +object PipelineUtilities { + /** + * Saves metadata that is required by spark pipeline model in order to read a model. + * @param uid The id of the PipelineModel saved. + * @param cls The class name. + * @param metadataPath The metadata path. + * @param sc The spark context. 
+ */ + def saveMetadata(uid: String, + cls: String, + metadataPath: String, + sc: SparkContext, + overwrite: Boolean): Unit = { + val metadata = ("class" -> cls) ~ + ("timestamp" -> System.currentTimeMillis()) ~ + ("sparkVersion" -> sc.version) ~ + ("uid" -> uid) ~ + ("paramMap" -> "{}") + + val metadataJson: String = compact(render(metadata)) + val metadataFile = new File(metadataPath) + val fileExists = metadataFile.exists() + if (fileExists) { + if (overwrite) { + metadataFile.delete() + } else { + throw new Exception( + s"Failed to save pipeline, metadata file $metadataPath already exists, please turn on overwrite option") + } + } + sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath) + } + + def makeQualifiedPath(sc: SparkContext, path: String): Path = { + val modelPath = new Path(path) + val hadoopConf = sc.hadoopConfiguration + // Note: to get correct working dir, must use root path instead of root + part + val fs = modelPath.getFileSystem(hadoopConf) + modelPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + } +} diff --git a/tools/bin/mml-exec b/tools/bin/mml-exec new file mode 100755 index 0000000000..21925fd447 --- /dev/null +++ b/tools/bin/mml-exec @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +HERE="$(cd "$(dirname "$(realpath "$0")")"; pwd)" + +exe="$1"; shift; args=( "$@" ); set -- +if [[ "$exe" = "" ]]; then + echo "Usage: $(basename "$0") [arguments]" + echo " runs with --package and --repositories flags that are set" + echo " up to use the most recent MMLSpark build." + echo " If is \"jupyter-notebook\" then run the notebook server," + echo " and additional arguments are passed to the \"jupyter notebook\" command." + exit 1 +fi + +if [[ -x "$HERE/../../runme" ]]; then . "$HERE/../../runme" +else echo "Could not find \"runme\"" 1>&2; exit 1; fi + +# If we let spark guess a driver, it can find "python2.7" in the path (eg, the +# system's installation) and use that; so do this to force it to use "python" in +# our path, which is a symlink to the conda python. +if [[ "x$PYSPARK_PYTHON" = "x" ]]; then export PYSPARK_PYTHON="python"; fi + +if [[ "$exe" == "jupyter-notebook" ]]; then + if [[ "${#args[@]}" = 0 ]]; then args="" + else args="$(printf "%q " "${args[@]}")"; fi + export PYSPARK_DRIVER_PYTHON="jupyter" + export PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --ip=* $args" + exe="pyspark"; args=() +fi + +MML_M2REPOS="file:$BUILD_ARTIFACTS/packages/m2,$MAVEN_URL" +MML_PACKAGE="com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION" + +exec "$exe" --repositories "$MML_M2REPOS" --packages "$MML_PACKAGE" \ + --master "local[*]" "${args[@]}" diff --git a/tools/build-pr/checkout b/tools/build-pr/checkout new file mode 100755 index 0000000000..e14b314f5a --- /dev/null +++ b/tools/build-pr/checkout @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" + +PRDIR="$BASEDIR/.build-pr" +_md "$PRDIR" + +# make it possible to use the main version of these files later too +_ cp -a "$TOOLSDIR/build-pr" "$PRDIR/build-pr" + +if [[ "$BUILDPR" = "" ]]; then exit 0; fi + +echo "##[section] PR Build for #$BUILDPR" + +api "pulls/$BUILDPR" - '.head.sha // error("no such PR")' > /dev/null +sha1="$(api "pulls/$BUILDPR" - '.head.sha')" +repo="$(api "pulls/$BUILDPR" - '.head.repo.full_name')" +ref="$(api "pulls/$BUILDPR" - '.head.ref')" +repourl="https://github.com/$repo" + +printf 'PR BUILD for #%s\n repo: %s\n ref: %s\n sha1: %s\n' \ + "$BUILDPR" "$repo" "$ref" "$sha1" + +git checkout "master" > /dev/null 2>&1 +oldbr="$(git for-each-ref --format="%(refname:short)" "refs/heads/pr-*")" +if [[ "x$oldbr" != "x" ]]; then git branch -D $oldbr; fi + +text="$(jsonq "[A build has started.]($VURL)")" +api "issues/$BUILDPR/comments" -d '{"body":'"$text"'}' - '.id' > "$PRDIR/comment-id" + +_get_T +git fetch "https://$T@github.com/$repo" "$ref:refs/heads/pr-$BUILDPR" +git checkout "pr-$BUILDPR" +git reset --hard "$sha1" + +# useful info in build +{ echo "# This is a build for [github PR #$BUILDPR]($GURL)" + echo "" + echo "Associated Changes (actual ones)" + echo ""; echo "---"; echo "" + git log --format="* [%h]($repourl/commit/%H) [%aN](mailto:%aE) %s" \ + "origin/master..$sha1" + } > "$PRDIR/PR-Build.md" +echo "##vso[task.uploadsummary]$PRDIR/PR-Build.md" + +# variable overrides +prvar() { printf '%s=%q\n' "$1" "$2" >> "$TOOLSDIR/local-config.sh"; } +prvar BUILD_SOURCEVERSION "$sha1" +prvar BUILD_REPOSITORY_NAME "$repo" +prvar BUILD_REPOSITORY_ID "$repourl" +prvar BUILD_REPOSITORY_URI "$repourl" +prvar BUILD_SOURCEBRANCH "refs/heads/$ref" +prvar BUILD_SOURCEBRANCHNAME "$(basename "$ref")" +prvar BUILD_SOURCEVERSIONAUTHOR "$(git log -1 --format="%aN <%aE>")" +prvar BUILD_SOURCEVERSIONMESSAGE "$(git log -1 --format="%s")" diff --git a/tools/build-pr/report b/tools/build-pr/report new file mode 100755 index 0000000000..0173e6573a --- /dev/null +++ b/tools/build-pr/report @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +PRDIR="$BASEDIR/.build-pr" +F="$PRDIR/build-pr/$(basename ${BASH_SOURCE[0]})" + +if [[ "${BASH_SOURCE[0]}" != "$F" ]]; then + if [[ -x "$F" ]]; then exec "$F"; fi; exit +fi + +@ "shared.sh" + +ICONS_URL="https://$MAIN_CONTAINER.blob.core.windows.net/icons" +icon="$ICONS_URL/Robot" +case "${AGENT_JOBSTATUS,,}" in + ( succeeded ) icon+="2.png"; box="![PASS]($icon) Pass" ;; + ( canceled ) icon+="1.png"; box="![CANCEL]($icon) Canceled" ;; + ( failed ) icon+="0.png"; box="![FAIL]($icon) Fail" ;; + ( * ) icon+="1.png"; box="![$AGENT_JOBSTATUS]($icon) Unknown" ;; +esac + +if [[ "$BUILDPR" = "" ]]; then + _ az storage blob copy start --account-name "$MAIN_CONTAINER" \ + --destination-container "icons" --destination-blob "BuildStatus.png" \ + --source-uri "$icon" + exit +fi + +if [[ ! -r "$PRDIR/comment-id" ]]; then exit; fi + +cid="$(< "$PRDIR/comment-id")" + +api "issues/comments/$cid" -X DELETE + +text="$(jsonq "[$box! 
The build has ${AGENT_JOBSTATUS,,}.]($VURL)")" +api "issues/$BUILDPR/comments" -d '{"body":'"$text"'}' - '.id' > "$PRDIR/comment-id" diff --git a/tools/build-pr/shared.sh b/tools/build-pr/shared.sh new file mode 100644 index 0000000000..09875b3147 --- /dev/null +++ b/tools/build-pr/shared.sh @@ -0,0 +1,47 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +set -e + +cd "$BASEDIR" + +if [[ "$BUILDPR" = "" ]]; then : +elif [[ "$BUILDPR" = *[^0-9]* ]]; then + echo "ERROR: \$BUILDPR should be a number, got: \"$BUILDPR\"" 1>&2 + exit 1 +fi + +T="" +_get_T() { + if [[ "x$T" = "x" ]]; then + T="$(__ az keyvault secret show --vault-name mmlspark-keys --name github-auth \ + | jq -r ".value" | base64 -d)" + fi +} + +declare -A api_cache +api() { + local call="$1"; shift + local curlargs=() x use_cache=1 json="" + while (($# > 0)); do + x="$1"; shift + if [[ "x$x" = "x-" ]]; then break; else use_cache=0; curlargs+=("$x"); fi; + done + if ((use_cache)); then json="${api_cache["${call} ${curlargs[*]}"]}"; fi + if [[ -z "$json" ]]; then + _get_T + json="$(curl --silent --show-error -H "AUTHORIZATION: bearer ${T#*:}" \ + "https://api.github.com/repos/Azure/mmlspark/$call" \ + "${curlargs[@]}")" + if ((use_cache)); then api_cache["${call} ${curlargs[*]}"]="$json"; fi + fi + if (($# == 0)); then echo "$json"; else jq -r "$@" <<<"$json"; fi +} + +jsonq() { # text...; quotes the text as a json string + jq --null-input --arg txt "$*" '$txt' +} + +VURL="${SYSTEM_TASKDEFINITIONSURI%/}/$SYSTEM_TEAMPROJECT" +VURL+="/_build/index?buildId=$BUILD_BUILDID&_a=summary" +GURL="$(api "pulls/$BUILDPR" - '.html_url')" diff --git a/tools/config.sh b/tools/config.sh new file mode 100644 index 0000000000..8a5222e5f8 --- /dev/null +++ b/tools/config.sh @@ -0,0 +1,274 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Environment Configuration +# (See the `defvar` documentation in "utils.sh" too.) + +# Make it possible to have a local installation by setting HOME +defvar -xp HOME; mkdir -p "$HOME" + +# Definition of things that need to be installed. Each one is followed by misc +# settings, where some of the settings can be computed from others. The used +# settings are: +# * ver: The version of the library. This version can be used in other settings +# by using "<{ver}>", it is also available in the `.setup` and `.init` hooks +# as "$ver". +# * lib: The name of the directory (in ~/lib) to install to, defaults to the +# library name in lowercase. +# * envvar: An environment variable prefix to set to the library's version and +# installation directory. Defaults to the library name in uppercase. If this +# is "FOO", then the two variables set are $FOO_VERSION and $FOO_HOME. +# * url: The installer URL. +# * sha256: The expected sha256 of the installer file. +# * instcmd: The installation command for an `sh` installer (should have `"$1"` +# somewhere for the installer file or more likely `bash "$1"`, will be +# `eval`uated). This must be set for sh installers. +# * exes: Executables to symlink to ~/bin. +# * vers: Version info in a format of "cmd|pattern" where the `cmd` part is the +# command to run to get the version (after the library is installed), and the +# output pattern (usually with "<{ver}>", sometimes can also have shell glob +# patterns like "*"). 
The pattern should identify a complete line in the +# output of `cmd`. +# * bindir: The (relative) sub-directory in which executables are found, +# defaults to "bin". +# * prereq: Prerequisite information in a format of "cmd|msg", where cmd is a +# shell command to run (its output will not be shown), and a message to show +# in case of failure. The message cannot contain "|"s. +# * where: A list of contexts where the library is needed; the contexts are: +# "devel" for developer installation, "build" for just building (eg, on the +# build server), "runtime" for libraries that are needed it a user +# environment. +# In addition, further library-specific setup operations can be put in functions +# named ".setup" and ".init". Both are functions +# run after the library is already populated and its envvar is set and it run in +# its directory (can cd elsewhere), but before executable symlinks are made. +# The .setup function is called to perform setup operation after installation, +# and the .init function is always called when runme starts, so it's useful to +# initialize the environment. + +# First, the common container definition +defvar MAIN_CONTAINER "mmlspark" +# to use the storage directly replace: "azureedge" -> "blob.core.windows" +_main_url() { echo "https://$MAIN_CONTAINER.azureedge.net/$1"; } +# The base URL for our installables +defvar INSTALLER_URL "$(_main_url "installers")" +# Directory for caching installers; if it is empty then no caching is used +defvar INSTALLER_CACHE_DIR "$HOME/.mmlspark_cache" + +INSTALLATIONS=( + + Java ver: "1.8.0" lib: "jdk" + url: "http://cdn.azul.com/zulu/bin/zulu8.21.0.1-jdk8.0.131-linux_x64.tar.gz" + sha256: "17218c6bdd608b5714ffba9d5e28522bb2efc309266ba46232b8b918e6e62133" + exes: "java javac jar javadoc" + vers: "java -version|openjdk version \"<{ver}>_*\"" + where: "devel runtime build" + + SBT ver: "0.13.15" + url: "https://github.com/sbt/sbt/releases/download/v<{ver}>/sbt-<{ver}>.tgz" + sha256: "b6e073d7c201741dcca92cfdd1dd3cd76c42a47dc9d8c8ead8df7117deed7aef" + exes: "sbt" + vers: "sbt -no-colors sbt-version|?info? <{ver}>" + where: "devel build" + + Spark ver: "2.1.1" + url: "https://archive.apache.org/dist/spark/spark-<{ver}>/spark-<{ver}>-bin-hadoop2.7.tgz" + sha256: "372ac4f73221c07696793101007a4f19e31566d1f0d9bd0e5205b6fb5b45bfc2" + exes: "spark-shell spark-sql spark-submit spark-class pyspark sparkR" + vers: "spark-shell --version|* version <{ver}>" + where: "devel runtime build" + + Conda ver: "4.2.12" + url: "https://repo.continuum.io/miniconda/Miniconda3-<{ver}>-Linux-x86_64.sh" + sha256: "c59b3dd3cad550ac7596e0d599b91e75d88826db132e4146030ef471bb434e9a" + instcmd: 'PYTHONPATH="" bash "$1" -b -f -p "$PWD"' + exes: "python python3 ipython ipython3 jupyter conda pip" + vers: "PYTHONDONTWRITEBYTECODE=true conda --version|conda <{ver}>" + where: "devel runtime build" + + DataSets ver: "2017-05-25" + url: "$INSTALLER_URL/datasets-<{ver}>.tgz" + sha256: "9cf58c6d22fa3d3507608c5af23eb791e37bea324d2c98209ae7356becd4ce41" + vers: "cat version|<{ver}>" + where: "devel build" + + # Note: this is different than the version defined in SBT to avoid breaking + # work in progress; but when that's done, we need to sync up the two version + # via a shared version seetting so they cannot diverge. 
+ CNTK ver: "beta12" + url: "$INSTALLER_URL/CNTK-2-0-<{ver}>-0-Linux-64bit-CPU-Only.tar.gz" + sha256: "033c5da4b3034f51d0bde6f0d926f7d075a146b16e7c6148a38cecba928efc6c" + exes: "cntk" + vers: "cntk|*Built time: Feb 22 2017 13:29:08" + bindir: "cntk/bin" + where: "devel build" + + DockerBuildx ver: "0.0.1" + url: "https://github.com/Microsoft/docker-buildx/archive/v<{ver}>.tar.gz" + sha256: "bac3d0036224f4279fc553031849c548296cfae432b3212ea21b2089703b290e" + exes: "docker-buildx" + vers: "docker-buildx -V|<{ver}>" + bindir: "." + where: "devel build" + +) + +# $TESTS holds the specification of tests to run. The syntax is a list of +# `tag`, `+tag` or `-tag`, separated by commas and/or spaces; and `tag` is +# equivalent to `+tag`. The semantics of the specs mimicks the scala semantics +# for tags: we run tests that are tagged with `+tag`s, but not `-tag`s, and if +# there are no `+tag`s then run all tests except for `-tag`s. `all` and `none` +# behave as you'd expect, but they can have additional benefits (e.g., `none` +# will avoid even compiling the tests); avoid using them with other tags. The +# default is `+scala,-extended` for local builds, and `all` for server builds. +# The value is normalized to hold comma-separated `+tag` or `-tag`, except for a +# single `all`/`none` which don't get a sign prefix. $PUBLISH similarly holds +# the specification of things to publish. +defvar -x TESTS "default" +defvar -x PUBLISH "default" +if [[ "$TESTS" = "default" ]]; then + if [[ "$BUILDMODE" = "server" ]]; then TESTS="all"; else TESTS="+scala,-extended"; fi +fi +if [[ "$PUBLISH" = "default" ]]; then + if [[ "$BUILDMODE" = "server" ]]; then PUBLISH="-demo,-docker"; else PUBLISH="none"; fi +fi +# Tag definitions for $TESTS +deftag scala +deftag extended + deftag python extended + deftag e2e extended +deftag linuxonly +# Tag definitions for $PUBLISH +map deftag storage maven pip demo docker + +defvar -p SRCDIR "$BASEDIR/src" +defvar -p BUILD_ARTIFACTS "$BASEDIR/BuildArtifacts" +defvar -p TEST_RESULTS "$BASEDIR/TestResults" + +# Specific installation functions + +SBT.setup() { + local f="$SRCDIR/project/build.properties" txt="sbt.version = $SBT_VERSION" + if [[ ! -e "$f" ]]; then echo "$txt" > "$f"; return; fi + if [[ "x$(< "$f")" != "x$txt" ]]; then failwith "$f exists"; fi +} +defvar SCALA_VERSION "2.11" +defvar SCALA_FULL_VERSION "$SCALA_VERSION.8" +SBT.init() { + setenv SCALA_VERSION "$SCALA_VERSION" + setenv SCALA_FULL_VERSION "$SCALA_FULL_VERSION" +} + +Spark.setup() { + if [[ -e "conf/hive-site.xml" ]]; then failwith "conf/hive-site.xml exists"; fi + { echo "" + echo " " + echo " javax.jdo.option.ConnectionURL" + echo " jdbc:derby:memory:databaseName=metastore_db;create=true" + echo " the URL of the Derby Server database" + echo " " + echo " " + echo " javax.jdo.option.ConnectionDriverName" + echo " org.apache.derby.jdbc.EmbeddedDriver" + echo " " + echo "" + } > "conf/hive-site.xml" +} +Spark.init() { + local f; for f in "python/lib/"*.zip; do + envinit_eval \ + '[[ ":$PYTHONPATH:" != *":$SPARK_HOME/'"$f"':"* ]]' \ + '&& export PYTHONPATH="$PYTHONPATH:$SPARK_HOME/'"$f"'"' + done +} + +Conda.setup() { + show section "Installing Conda & Packages" + _ cp "$TOOLSDIR/mmlspark-packages.spec" . + # Use `--no-update-deps` to avoid updating everything (including conda & + # python) to latest versions; and `--no-deps` is to avoid dependencies that we + # know are not needed, such as QT. 
+ _ ./bin/conda install --name "root" --no-update-deps --no-deps --yes \ + --quiet --file "mmlspark-packages.spec" + if [[ "$BUILDMODE" != "runtime" ]]; then + ./bin/pip install "xmlrunner" "wheel" + else + show section "Minimizing conda directory" + collect_log=2 _ ./bin/conda uninstall -y tk + collect_log=2 _ ./bin/conda clean -y --all + _rm "pkgs" + show command "rm lib/libmkl_....so" + rm -f lib/libmkl_{,vml_}{def,rt,sequential,cmpt,mc{,2,3},avx512{,_mic}}.so + show command "rm **/*.pyc" + rm -rf **/__pycache__/ + rm -f **/*.pyc + show command "strip **/*.so" + # note: running this without output and ignore its exit status, so it can + # fail silently (its stderr is verbose with files it can't strip, and it + # does return an error) + strip **/*.so > /dev/null 2>&1 + fi +} + +_add_to_ld_library_path() { + envinit_eval \ + '[[ ":$LD_LIBRARY_PATH:" != *":'"$1"':"* ]]' \ + '&& export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:'"$1"'"' +} +_req_library_so() { # file.so libname + { /sbin/ldconfig -p | grep -q "$1"; } || + failwith "$1 missing, try apt-get install $2" +} +CNTK.init() { + _req_library_so "libmpi_cxx.so" "libopenmpi1.10" + _req_library_so "libgomp.so" "libgomp1" + _add_to_ld_library_path '$CNTK_HOME/cntk/lib' + _add_to_ld_library_path '$CNTK_HOME/cntk/dependencies/lib' +} + +# Storage for build artifacts +defvar STORAGE_CONTAINER "buildartifacts" +defvar STORAGE_URL "$(_main_url "$STORAGE_CONTAINER")" + +# Container for maven/pip packages +defvar MAVEN_CONTAINER "maven" +defvar -x MAVEN_URL "$(_main_url "$MAVEN_CONTAINER")" +defvar -d MAVEN_PACKAGE "com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:<{MML_VERSION}>" +defvar PIP_CONTAINER "pip" +defvar -x PIP_URL "$(_main_url "$PIP_CONTAINER")" +defvar -d PIP_PACKAGE "mmlspark-<{MML_VERSION}>-py2.py3-none-any.whl" + +# E2E test cluster information +defvar E2E_CLUSTER_NAME "mmlsparktest" +defvar E2E_RESOURCE_GROUP "mmlsparktest" +defvar E2E_CLUSTER_SSH "spark@${E2E_CLUSTER_NAME}-ssh.azurehdinsight.net" +defvar E2E_PARALLEL_RUNS "2" +defvar CLUSTER_SDK_DIR "/mml-sdk" # this is for all clusters + +# Demo cluster information +defvar DEMO_CLUSTER_NAME "mmlsparkdemo" +defvar DEMO_RESOURCE_GROUP "mmlsparkdemo" + +# Public contact email +defvar -x SUPPORT_EMAIL "mmlspark-support@microsoft.com" + +# The following should generally not change + +PROFILE_FILE="$HOME/.mmlspark_profile" +CONF_TRACK_FILE="$HOME/.mmlspark_installed_libs" +ENV_INIT_FILES=(".profile" # first: write here if none of these files exist + ".bash_profile" ".bash_login" ".bashrc" ".zprofile" ".zshrc") +LIB_VERSION_FILE="MMLSPARK_INSTALLED-README.txt" + +CURL_FLAGS="-f --location --retry 20 --retry-max-time 60 --connect-timeout 120" +CURL_FLAGS="$CURL_FLAGS --speed-limit 10 --speed-time 120" +if [[ "$BUILDMODE" = "server" ]]; then CURL_FLAGS="$CURL_FLAGS --silent --show-error" +else CURL_FLAGS="$CURL_FLAGS --progress-bar"; fi + +envinit_eval '[[ ":$PATH:" != *":$HOME/bin:"* ]] && export PATH="$HOME/bin:$PATH"' +envinit_commands+=( + 'ldpaths="$(ldconfig -v 2> /dev/null | while read -r line; do + if [[ "$line" = *: ]]; then echo -n "$line"; fi; done)"' + '[[ ":$LD_LIBRARY_PATH:" != *":$ldpaths"* ]] && export LD_LIBRARY_PATH="$ldpaths$LD_LIBRARY_PATH"' +) diff --git a/tools/docker/Dockerfile b/tools/docker/Dockerfile new file mode 100644 index 0000000000..537678f7ab --- /dev/null +++ b/tools/docker/Dockerfile @@ -0,0 +1,54 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +FROM ubuntu:16.04 + +#META# echo LABEL maintainer="$SUPPORT_EMAIL" + +#META# H="/home/mmlspark"; echo ENV \ + USER="mmlspark" HOME="$H" PATH="$H/bin:\$PATH" + +RUN : '==== create a user (and also hide the random hostname in the prompt)' \ + && echo 'PS1='\''\u:\w\$ '\' >> "/etc/skel/.bashrc" \ + && useradd -c "Microsoft ML for Apache Spark" -U -G root -d "$HOME" -m "$USER" \ + && : '==== install needed packages' \ + && apt-get update --fix-missing \ + && apt-get install -y curl unzip bzip2 libopenmpi1.10 libgomp1 libunwind8 libtiff5 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR $HOME +USER $USER:$USER + +# add the premade runtime environment +ADD mmlspark.tgz .. + +#BUILD# -t mmlspark:keep-for-cache +#META# rm -f "mmlspark.tgz" # avoid re-hashing a big file + +#META# H="/home/mmlspark"; echo ENV \ + MML_VERSION=$(printf "%q" "$MML_VERSION") \ + MML_BUILD_INFO=$(printf "%q" "$MML_BUILD_INFO") \ + MMLSPARK_JUPYTER_PORT="8888" \ + MML_M2REPOS="file:$H/mml-m2,$MAVEN_URL" \ + MML_PACKAGE="com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION" + +ADD notebooks.tgz / +ADD mml-m2.tgz . +ADD bin.tgz . + +RUN : '==== make the notebooks convenient to access and have the right date' \ + && find "/notebooks" -execdir touch "{}" + \ + && ln -s /notebooks . \ + && : '==== disable the jupyter security token' \ + && mkdir ".jupyter" \ + && echo "c.NotebookApp.token = ''" > ".jupyter/jupyter_notebook_config.py" \ + && : '==== pre-populate the ivy cache' \ + && bash -c ". ./.mmlspark_profile; \ + spark-shell --master \"local[*]\" \ + --repositories \"$MML_M2REPOS\" --packages \"$MML_PACKAGE\" < /dev/null" + +# #SQUASH# + +# use CMD to get both "docker run this" and "docker run this command" work +EXPOSE $MMLSPARK_JUPYTER_PORT +CMD ["bin/launcher"] diff --git a/tools/docker/bin/EULA.txt b/tools/docker/bin/EULA.txt new file mode 100644 index 0000000000..fbc1b8ccc6 --- /dev/null +++ b/tools/docker/bin/EULA.txt @@ -0,0 +1,203 @@ +MICROSOFT SOFTWARE LICENSE TERMS +================================ + +MICROSOFT MACHINE LEARNING LIBRARY FOR APACHE SPARK +--------------------------------------------------- + +These license terms are an agreement between you and Microsoft +Corporation (or one of its affiliates). They apply to the software +named above and any Microsoft services or software updates (except to +the extent such services or updates are accompanied by new or additional +terms, in which case those different terms apply prospectively and do +not alter your or Microsoft's rights relating to pre-updated software or +services). IF YOU COMPLY WITH THESE LICENSE TERMS, YOU HAVE THE RIGHTS +BELOW. BY DOWNLOADING OR USING THE SOFTWARE, YOU ACCEPT THESE TERMS. + +1. INSTALLATION AND USE RIGHTS. + + a. General. You may install and use any number of copies of the + software to develop and test your applications. + + b. Third Party Software. The software may include third party + applications that Microsoft, not the third party, licenses to you + under this agreement. Any included notices for third party + applications are for your information only. + + c. Open Source Components. The software may contain third party + copyrighted software licensed under open source licenses with + source code availability obligations. Copies of those licenses + are included in the ThirdPartyNotices file or other accompanying + notices file. 
You may obtain the complete corresponding source + code from Microsoft if and as required under the relevant open + source license by sending a money order or check for $5.00 to: + Source Code Compliance Team, Microsoft Corporation, 1 Microsoft + Way, Redmond, WA 98052, USA. Please write “source code for + Microsoft Machine Learning Library for Apache Spark” in the memo + line of your payment. You may also find a copy of the source at + http://aka.ms/getsource . + +2. SCOPE OF LICENSE. The software is licensed, not sold. Microsoft + reserves all other rights. Unless applicable law gives you more + rights despite this limitation, you will not (and have no right to): + + a. work around any technical limitations in the software that only + allow you to use it in certain ways; + + b. reverse engineer, decompile, or disassemble the software, or + attempt to do so, except and only to the extent permitted by + licensing terms governing the use of open-source components that + may be included with the software; + + c. remove, minimize, block, or modify any notices of Microsoft or its + suppliers in the software; + + d. use the software in any way that is against the law or to create + or propagate malware; or + + e. share, publish, distribute, or lend the software (except for any + distributable code, subject to the terms above), provide the + software as a stand-alone hosted solution for others to use, or + transfer the software or this agreement to any third party. + +3. EXPORT RESTRICTIONS. You must comply with all domestic and + international export laws and regulations that apply to the software, + which include restrictions on destinations, end users, and end use. + For further information on export restrictions, visit + http://aka.ms/exporting . + +4. SUPPORT SERVICES. Microsoft is not obligated under this agreement to + provide any support services for the software. Any support provided + is “as is”, “with all faults”, and without warranty of any kind. + +5. UPDATES. The software may periodically check for updates, and + download and install them for you. You may obtain updates only from + Microsoft or authorized sources. Microsoft may need to update your + system to provide you with updates. You agree to receive these + automatic updates without any additional notice. Updates may not + include or support all existing software features, services, or + peripheral devices. + +6. ENTIRE AGREEMENT. This agreement, and any other terms Microsoft may + provide for supplements, updates, or third-party applications, is the + entire agreement for the software. + +7. Governing Law and Venue. This Agreement is governed by and construed + in accordance with the laws of the state of Washington, without + reference to its choice of law principles to the contrary. Each + party hereby consents to the jurisdiction and venue of the state and + federal courts located in King County, Washington, with regard to any + suit or claim arising under or by reason of this Agreement. + +8. CONSUMER RIGHTS; REGIONAL VARIATIONS. This agreement describes + certain legal rights. You may have other rights, including consumer + rights, under the laws of your state or country. Separate and apart + from your relationship with Microsoft, you may also have rights with + respect to the party from which you acquired the software. This + agreement does not change those other rights if the laws of your + state or country do not permit it to do so. 
For example, if you + acquired the software in one of the below regions, or mandatory + country law applies, then the following provisions apply to you: + + a. Australia. You have statutory guarantees under the Australian + Consumer Law and nothing in this agreement is intended to affect + those rights. + + b. Canada. If you acquired this software in Canada, you may stop + receiving updates by turning off the automatic update feature, + disconnecting your device from the Internet (if and when you + re-connect to the Internet, however, the software will resume + checking for and installing updates), or uninstalling the + software. The product documentation, if any, may also specify how + to turn off updates for your specific device or software. + + c. Germany and Austria. + + i. Warranty. The properly licensed software will perform + substantially as described in any Microsoft materials that + accompany the software. However, Microsoft gives no + contractual guarantee in relation to the licensed software. + + ii. Limitation of Liability. In case of intentional conduct, gross + negligence, claims based on the Product Liability Act, as well + as, in case of death or personal or physical injury, Microsoft + is liable according to the statutory law. + + Subject to the foregoing clause ii., Microsoft will only be liable + for slight negligence if Microsoft is in breach of such material + contractual obligations, the fulfillment of which facilitate the + due performance of this agreement, the breach of which would + endanger the purpose of this agreement and the compliance with + which a party may constantly trust in (so-called "cardinal + obligations"). In other cases of slight negligence, Microsoft + will not be liable for slight negligence. + +9. DISCLAIMER OF WARRANTY. THE SOFTWARE IS LICENSED “AS IS.” YOU BEAR + THE RISK OF USING IT. MICROSOFT GIVES NO EXPRESS WARRANTIES, + GUARANTEES, OR CONDITIONS. TO THE EXTENT PERMITTED UNDER APPLICABLE + LAWS, MICROSOFT EXCLUDES ALL IMPLIED WARRANTIES, INCLUDING + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND + NON-INFRINGEMENT. + +10. LIMITATION ON AND EXCLUSION OF DAMAGES. IF YOU HAVE ANY BASIS FOR + RECOVERING DAMAGES DESPITE THE PRECEDING DISCLAIMER OF WARRANTY, YOU + CAN RECOVER FROM MICROSOFT AND ITS SUPPLIERS ONLY DIRECT DAMAGES UP + TO U.S. $5.00. YOU CANNOT RECOVER ANY OTHER DAMAGES, INCLUDING + CONSEQUENTIAL, LOST PROFITS, SPECIAL, INDIRECT, OR INCIDENTAL + DAMAGES. + + This limitation applies to (a) anything related to the software, + services, content (including code) on third party Internet sites, or + third party applications; and (b) claims for breach of contract, + warranty, guarantee, or condition; strict liability, negligence, or + other tort; or any other claim; in each case to the extent permitted + by applicable law. + + It also applies even if Microsoft knew or should have known about + the possibility of the damages. The above limitation or exclusion + may not apply to you because your state, province, or country may + not allow the exclusion or limitation of incidental, consequential, + or other damages. + +Please note: As this software is distributed in Canada, some of the +clauses in this agreement are provided below in French. + +Remarque: Ce logiciel étant distribué au Canada, certaines des clauses +dans ce contrat sont fournies ci-dessous en français. + +EXONÉRATION DE GARANTIE. Le logiciel visé par une licence est offert +«tel quel». Toute utilisation de ce logiciel est à votre seule risque +et péril. 
Microsoft n’accorde aucune autre garantie expresse. Vous +pouvez bénéficier de droits additionnels en vertu du droit local sur la +protection des consommateurs, que ce contrat ne peut modifier. La ou +elles sont permises par le droit locale, les garanties implicites de +qualité marchande, d’adéquation à un usage particulier et d’absence de +contrefaçon sont exclues. + +LIMITATION DES DOMMAGES-INTÉRÊTS ET EXCLUSION DE RESPONSABILITÉ POUR LES +DOMMAGES. Vous pouvez obtenir de Microsoft et de ses fournisseurs une +indemnisation en cas de dommages directs uniquement à hauteur de 5,00 $ +US. Vous ne pouvez prétendre à aucune indemnisation pour les autres +dommages, y compris les dommages spéciaux, indirects ou accessoires et +pertes de bénéfices. + +Cette limitation concerne: + +* tout ce qui est relié au logiciel, aux services ou au contenu (y + compris le code) figurant sur des sites Internet tiers ou dans des + programmes tiers; et + +* les réclamations au titre de violation de contrat ou de garantie, ou + au titre de responsabilité stricte, de négligence ou d’une autre faute + dans la limite autorisée par la loi en vigueur. + +Elle s’applique également, même si Microsoft connaissait ou devrait +connaître l’éventualité d’un tel dommage. Si votre pays n’autorise pas +l’exclusion ou la limitation de responsabilité pour les dommages +indirects, accessoires ou de quelque nature que ce soit, il se peut que +la limitation ou l’exclusion ci-dessus ne s’appliquera pas à votre +égard. + +EFFET JURIDIQUE. Le présent contrat décrit certains droits juridiques. +Vous pourriez avoir d’autres droits prévus par les lois de votre pays. +Le présent contrat ne modifie pas les droits que vous confèrent les lois +de votre pays si celles-ci ne le permettent pas. diff --git a/tools/docker/bin/eula b/tools/docker/bin/eula new file mode 100755 index 0000000000..9b8cdec066 --- /dev/null +++ b/tools/docker/bin/eula @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +more "$(dirname "$0")/EULA.txt" + +echo "" +echo "" +echo "(Note: you can also use \"-e ACCEPT_EULA=Y\" to indicate agreement.)" +echo "" +read -ep "Do you agree to the EULA? " R +if [[ "x${R,,}" != @("xy"|"xyes") ]]; then echo "Bye." +else ACCEPT_EULA=Y launcher; fi diff --git a/tools/docker/bin/eula.html b/tools/docker/bin/eula.html new file mode 100644 index 0000000000..0286e89b05 --- /dev/null +++ b/tools/docker/bin/eula.html @@ -0,0 +1,54 @@ + +MMLSpark EULA + + + +
+ Please read the following EULA for the MMLSpark Docker Image. +

+ +     + +

+ + (Agreement will be remembered, but you can skip this check completely by + setting ACCEPT_EULA to Yes when starting the + container, e.g., docker run ... -e ACCEPT_EULA=Y ...) +

{TEXT}
+
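Putting the Docker pieces above together, a typical invocation of the published
image looks roughly like this (the 8888 port and the ACCEPT_EULA check come from
the Dockerfile and launcher in this patch; `microsoft/mmlspark` is the tag pushed
by the publish step, so use your local tag instead if you built the image
yourself):

    # accept the EULA up front and expose the bundled Jupyter notebooks
    docker run -it -p 8888:8888 -e ACCEPT_EULA=Y microsoft/mmlspark
    # or review the license first via the "eula" helper shown above
    docker run -it -p 8888:8888 microsoft/mmlspark eula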
diff --git a/tools/docker/bin/eula.py b/tools/docker/bin/eula.py new file mode 100755 index 0000000000..9715467593 --- /dev/null +++ b/tools/docker/bin/eula.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +from os import path, environ +from http.server import BaseHTTPRequestHandler, HTTPServer +import sys, threading, codecs + +def read_file(f): + with codecs.open(path.join(path.dirname(__file__), f), "r", + encoding = "utf-8") as inp: + return inp.read() +html = read_file("eula.html").replace("{TEXT}", read_file("EULA.txt")) + +class eulaRequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/exit-accept": + httpd.exit_code = 0 + threading.Thread(target = httpd.shutdown, daemon = True).start() + elif self.path == "/exit-reject": + httpd.exit_code = 1 + threading.Thread(target = httpd.shutdown, daemon = True).start() + else: + self.send_response(200) + self.send_header("Content-type","text/html") + self.end_headers() + message = html + self.wfile.write(bytes(message, "utf8")) + return + +pvar = "MMLSPARK_JUPYTER_PORT" +port = int(environ[pvar]) if pvar in environ else 8888 + +print("Running EULA server...") +httpd = HTTPServer(("", port), eulaRequestHandler) +httpd.serve_forever() + +print("Done, " + ("accept" if httpd.exit_code == 0 else "reject") + "ing") +sys.exit(httpd.exit_code) diff --git a/tools/docker/bin/launcher b/tools/docker/bin/launcher new file mode 100755 index 0000000000..528dc3c511 --- /dev/null +++ b/tools/docker/bin/launcher @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +shopt -s extglob + +. "$HOME/.mmlspark_profile" +cd "$HOME/notebooks" +echo "spark.sql.warehouse.dir $HOME/spark-warehouse" \ + > "$HOME/lib/spark/conf/spark-defaults.conf" + +if [[ "${ACCEPT_EULA,,}" != @(y|yes) ]]; then + { echo "ERROR: You must accept the End User License Agreement to use this container." + echo "Run this container with \"eula\" to read the EULA." + echo "Set the environment variable ACCEPT_EULA to \"Yes\" (or \"Y\") to accept the" + echo "agreement, e.g., \"docker run -it -e ACCEPT_EULA=Y ...\"." + } 1>&2 + exit 1 + echo "Waiting for EULA agreement"; eula.py || exit 1 +fi + +PYSPARK_DRIVER_PYTHON="jupyter" \ +PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=${MMLSPARK_JUPYTER_PORT:=8888} --ip=*" \ + pyspark --master "local[*]" --repositories "$MML_M2REPOS" --packages "$MML_PACKAGE" diff --git a/tools/docker/build-docker b/tools/docker/build-docker new file mode 100755 index 0000000000..1cdb330f66 --- /dev/null +++ b/tools/docker/build-docker @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +main() { + +show section "Building Docker Image" + +_rmcd "$BUILD_ARTIFACTS/docker-work" + +_tgzip() { # outdir workdir (always packs ".") + # avoid tracking times/owners to keep the bits stable (for docker caching) + local out="$1" dir="$2"; shift 2 + if [[ "$dir" != "$out" ]]; then cp -al "$dir" "$out"; fi + tar cf - --mtime 1970-1-1T00:00 --owner=mmlspark --group=mmlspark "$out" \ + | gzip -n9 > "$out.tgz" + rm -rf "$out" +} + +local envtgz="$INSTALLER_CACHE_DIR/$(get_runtime_hash).tgz" +if [[ -r "$envtgz" ]]; then + _ cp -al "$envtgz" "mmlspark.tgz" +else + show - "Creating base environment cache" + docker run --interactive --rm \ + -v "$BASEDIR:/mkenv/src:ro" \ + -v "$INSTALLER_CACHE_DIR:/mkenv/cache:ro" \ + -v "$(pwd):/home" \ + ubuntu:16.04 "/mkenv/src/tools/docker/build-env" \ + 2>&1 | ( IFS=""; while read -r line; do echo "| $line"; done ) + _ cp -al "mmlspark.tgz" "$envtgz" +fi + +_ _tgzip "notebooks" "$BUILD_ARTIFACTS/notebooks/local" +_ _tgzip "mml-m2" "$BUILD_ARTIFACTS/packages/m2" +_ _tgzip "bin" "$TOOLSDIR/docker/bin" +_ cp "$TOOLSDIR/docker/Dockerfile" . + +find . -type f | xargs cksum > ~/tmp/1 + +_ docker system prune -f +_ docker-buildx -t mmlspark . + +_ cd "$BASEDIR" +_rm "$BUILD_ARTIFACTS/docker-work" + +} +main "$@" diff --git a/tools/docker/build-env b/tools/docker/build-env new file mode 100755 index 0000000000..afa0607fe6 --- /dev/null +++ b/tools/docker/build-env @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# This file is a hack for creating the environment tgz with conda installed in +# the right place (cannot install it in one place and use in another). It would +# be better to do this using a docker multi-stage build, but there is no way to +# tag the intermediate step so it won't be deleted with a `docker system prune`. +# By using a tgz we properly control the caching. + +apt-get update --fix-missing +apt-get install -y curl unzip bzip2 libopenmpi1.10 libgomp1 libunwind8 libtiff5 binutils + +export USER="mmlspark" +export HOME="/home/$USER" + +echo 'PS1='\''\u:\w\$ '\' >> "/etc/skel/.bashrc" +useradd -c "Microsoft ML for Apache Spark" -U -d "$HOME" -m "$USER" +cd "$HOME" + +/mkenv/src/runme BUILDMODE=runtime INSTALLER_CACHE_DIR=/mkenv/cache \ + MML_VERSION="???" MML_BUILD_INFO="???" + +chown -R "$USER:$USER" "$HOME" +cd /home +tar czf "$USER.tgz" "$USER" +rm -rf "$USER" +chown -R "$USER:$USER" "$USER.tgz" diff --git a/tools/hdi/install-mmlspark.sh b/tools/hdi/install-mmlspark.sh new file mode 100755 index 0000000000..5c28b284c0 --- /dev/null +++ b/tools/hdi/install-mmlspark.sh @@ -0,0 +1,165 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# ----------------------------------------------------------------------------- +# Configurations for installing mmlspark + dependencies on an HDI +# cluster, from a specific storage blob (which is created by the build). + +# These are replaced by the build process. 
+DOWNLOAD_URL="<=<=fill-in-url=>=>"
+MAVEN_PACKAGE="<=<=fill-in-maven-package=>=>"
+MAVEN_URL="<=<=fill-in-maven-url=>=>"
+PIP_PACKAGE="<=<=fill-in-pip-package=>=>"
+SDK_DIR="<=<=fill-in-sdk-dir=>=>"
+HDFS_NOTEBOOKS_FOLDER="/HdiNotebooks/Microsoft ML Spark Examples"
+
+CONDA_ENVS=( "root" "py35" )
+
+CNTK_VER="2.0.beta12.0"
+CNTK_BASE_URL="https://cntk.ai/PythonWheel/CPU-Only"
+CNTK_WHEELS=( # each is "<conda-env>::<wheel-url>"
+  "root::$CNTK_BASE_URL/cntk-$CNTK_VER-cp27-cp27mu-linux_x86_64.whl"
+  "py35::$CNTK_BASE_URL/cntk-$CNTK_VER-cp35-cp35m-linux_x86_64.whl")
+
+get_headnodes() {
+  hdfssite="$(< "/etc/hadoop/conf/hdfs-site.xml")"
+  host1="${hdfssite#*<name>dfs.namenode.http-address.mycluster.nn1</name>*<value>}"
+  host2="${hdfssite#*<name>dfs.namenode.http-address.mycluster.nn2</name>*<value>}"
+  host1="${host1%%:*</value>*}"; num1="${host1%%-*}"; num1="${num1#hn}"
+  host2="${host2%%:*</value>*}"; num2="${host2%%-*}"; num2="${num2#hn}"
+  if [[ "$host1,$host2" = "," ]]; then return; fi
+  if (($num1 < $num2)); then echo "$host1,$host2"; else echo "$host2,$host1"; fi
+}
+
+get_primary_headnode() {
+  headnodes="$(get_headnodes)"
+  echo "${headnodes%%,*}"
+}
+
+# -----------------------------------------------------------------------------
+# Run on all nodes
+
+# Install prerequisites
+apt-get install -y openmpi-bin libunwind8
+
+# Install CNTK in Python 2.7 & 3.5
+_anaconda_bin() { local bin="$1"; shift; . "/usr/bin/anaconda/bin/$bin" "$@"; }
+for cntk_wheel in "${CNTK_WHEELS[@]}"; do
+  condaenv="${cntk_wheel%%::*}" wheel="${cntk_wheel#*::}" pkg="$(pip freeze | grep "cntk")"
+  _anaconda_bin activate "$condaenv"
+  echo -n "[$condaenv] "
+  if [[ ! "$pkg" = "cntk"* ]]; then echo "Installing CNTK..."; pip install "$wheel"
+  elif [[ "$pkg" = *"$CNTK_VER" ]]; then echo "Latest CNTK version is already installed."
+  else echo "Updating CNTK..."; pip install --upgrade --no-deps "$wheel"
+  fi
+  _anaconda_bin deactivate
+done
+
+# Download build artifacts & scripts
+tmp="/tmp/mmlinstall-$$"
+curlflags="--silent --show-error"
+mkdir "$tmp"
+echo "Downloading materials..."
+curl $curlflags -o "$tmp/BuildArifacts.zip" "$DOWNLOAD_URL/BuildArtifacts.zip"
+curl $curlflags -o "$tmp/update_livy.py" "$DOWNLOAD_URL/update_livy.py"
+rm -rf "$SDK_DIR"; mkdir -p "$SDK_DIR"
+cd "$SDK_DIR"; unzip "$tmp/BuildArifacts.zip"; rm "$tmp/BuildArifacts.zip"
+
+# Change Livy configuration
+# Note: cntk has the same .so files in both versions
+# Note: we don't need the sdk directory except for the so files (will soon go away)
+LD_STRING="/usr/bin/anaconda/lib/python2.7/site-packages/cntk/libs"
+LD_STRING+=":$SDK_DIR/sdk"
+echo "Updating Livy configurations..."
+python "$tmp/update_livy.py" \
+  "/home/spark/.sparkmagic/config.json" "$MAVEN_PACKAGE" "$LD_STRING"
+rm -rf "$tmp"
+
+/bin/su livy -c \
+  "spark-shell --packages \"$MAVEN_PACKAGE\" --repositories \"$MAVEN_URL\" < /dev/null"
+
+for env in "${CONDA_ENVS[@]}"; do
+  _anaconda_bin activate "$env"
+  pip install "$PIP_PACKAGE"
+  _anaconda_bin deactivate
+done
+
+# Check whether script is running on headnode
+if [[ "$(get_primary_headnode)" != "$(hostname -f)" ]]; then
+  echo "$(hostname -f) is not primary headnode, exiting."
+ exit 0 +fi + +# ----------------------------------------------------------------------------- +# Run only on the main head node + +# Copy notebooks to storage +hdfs dfs -rm -f -r -skipTrash "$HDFS_NOTEBOOKS_FOLDER" +hdfs dfs -mkdir -p "$HDFS_NOTEBOOKS_FOLDER" + +# pure bash url encoder +urlencode() { + local str="$1" ch + for ((i=0; i < ${#str}; i++)); do + ch="${str:i:1}" + case "$ch" in + ( [a-zA-Z0-9_.-] ) printf '%s' "$ch" ;; + ( * ) printf '%%%02x' "'$ch" ;; + esac + done + printf '\n' +} + +for f in "$SDK_DIR/notebooks/hdinsight/"*.ipynb; do + hdfs dfs -copyFromLocal "$(urlencode "$f")" "$HDFS_NOTEBOOKS_FOLDER" +done + +# Constants needed for changing Ambari configs +AMBARI_HOST="headnodehost" +AMBARI_PORT="8080" +AMBARI_USER="$(python -c ' +import hdinsight_common.Constants as C +print C.AMBARI_WATCHDOG_USERNAME')" +AMBARI_PASSWD="$(python -c ' +import hdinsight_common.ClusterManifestParser as P, hdinsight_common.Constants as C, base64 +base64pwd = P.parse_local_manifest().ambari_users.usersmap[C.AMBARI_WATCHDOG_USERNAME].password +print base64.b64decode(base64pwd)')" +CLUSTERNAME="$(python -c ' +import hdinsight_common.ClusterManifestParser as P +print P.parse_local_manifest().deployment.cluster_name')" + +# Stop and restart affected services +stop_service_via_rest() { # service-name + local name="$1"; echo "Stopping $name" + local data='{"RequestInfo": {"context" :"Stopping service '"$name"' to install MMLSpark"},' + data+=' "Body": {"ServiceInfo": {"state": "INSTALLED"}}}' + curl $curlflags -u "$AMBARI_USER:$AMBARI_PASSWD" -i -H "X-Requested-By: ambari" -X PUT -d "$data" \ + "http://$AMBARI_HOST:$AMBARI_PORT/api/v1/clusters/$CLUSTERNAME/services/$name" + echo "" +} +start_service_via_rest() { # service-name + local name="$1"; echo "Starting $name" + sleep 2 + local data='{"RequestInfo": {"context" :"Starting service '"$name"' with a new MMLSpark version"},' + data+=' "Body": {"ServiceInfo": {"state": "STARTED"}}}' + local args=($curlflags + -u "$AMBARI_USER:$AMBARI_PASSWD" -i -H "X-Requested-By: ambari" -X PUT -d "$data" + "http://$AMBARI_HOST:$AMBARI_PORT/api/v1/clusters/$CLUSTERNAME/services/$name") + local r="$(curl "${args[@]}")" + if [[ "$r" = *"500 Server Error"* || "$r" = *"internal system exception occurred"* ]]; then + sleep 60 + echo "Retry starting $name" + r="$(curl "${args[@]}")" + fi + echo "$r" + echo "" +} + +# Restart affected services +stop_service_via_rest LIVY +stop_service_via_rest JUPYTER +start_service_via_rest LIVY +start_service_via_rest JUPYTER + +echo "Done." diff --git a/tools/hdi/setup-test-authkey.sh b/tools/hdi/setup-test-authkey.sh new file mode 100755 index 0000000000..8d08960e80 --- /dev/null +++ b/tools/hdi/setup-test-authkey.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
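# (The <=<=fill-in-...=>=> markers here and in install-mmlspark.sh above are
# rewritten by _upload_artifacts_to_storage in tools/runme/build.sh with plain
# bash substitution, roughly
#   txt="$(< "$f")"
#   txt="${txt//<=<=fill-in-sdk-dir=>=>/$CLUSTER_SDK_DIR}"
# before the scripts are uploaded next to the build artifacts, so the markers
# must appear here verbatim.)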
+ +SDK_DIR="<=<=fill-in-sdk-dir=>=>" +NB_DIR="$SDK_DIR/notebooks" # This gets created as root, need to chown to spark + +# This is the public key used by the build to access the test cluster +PUB_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC0FUryXQloryZQGXVP9vOqBVsuUWihHs" +PUB_KEY+="YPHvNf8PgR6ctUxPrvdZheAuJ+JLmauZeV2B01lSqCdyhnkwxTKiwLh2dDFx2yruAcXd2" +PUB_KEY+="0MGjD3bc8kC60GxMgRMsRxL6Jgz9FtauLFLxiDuvsRxQcSCBGd+l+pPR/NuFZeSHlmRWC" +PUB_KEY+="mb25fY29tqyitEqytRT9viBA1QpoERSPuzr3DEy3YIJ4BLVen0VYLKMU58L7oyEZxTElm" +PUB_KEY+="7nQMeQKgRBWUZZgCB1pXR3JiTYni/bWP2t9wCWfgfNfSs1oUttt14Libm9NgRbjq2QzN8" +PUB_KEY+="aQtVv1KyAUKOEdPmFqiGCPh1lRvm4KB7MF key-for-VSO" + +cd ~spark || { echo "ERROR: could not find ~spark, aborting" 1>&2; exit 1; } + +# Add public key to authorized key +if [[ ! -f ".ssh/authorized_keys" ]] || ! grep -q " key-for-VSO\$" ".ssh/authorized_keys"; then + echo "Public key not found in authorized keys. Adding key..." + mkdir -p ".ssh" + echo "$PUB_KEY" >> ".ssh/authorized_keys" + chown -R "spark:spark" ".ssh" + chmod 700 ".ssh" +else + echo "Public key already added to authorized keys. Skipping..." +fi + +chown -R "spark:spark" "$NB_DIR" + +. /usr/bin/anaconda/bin/activate +conda update setuptools +pip install --upgrade nbconvert +pip install xmlrunner diff --git a/tools/hdi/update_livy.py b/tools/hdi/update_livy.py new file mode 100755 index 0000000000..3f4a475cf8 --- /dev/null +++ b/tools/hdi/update_livy.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys +import json + +def main(): + if len(sys.argv) != 4: + raise Exception(("Not enough" if len(sys.argv)<4 else "Too many") + " arguments.") + [_, config_file, maven_pkg, ld_lib_path] = sys.argv + with open(config_file) as conf_file: + conf=json.load(conf_file) + conf["session_configs"]["conf"] = {} + conf["session_configs"]["conf"]["spark.jars.packages"] = maven_pkg + with open(config_file, "w") as outfile: + json.dump(conf, outfile, indent=2, sort_keys=True) + +if __name__ == "__main__": + try: + main() + except Exception as exn: + for line in str(exn).split("\n"): + print "[ERROR] {0}".format(line) + sys.exit(1) diff --git a/tools/mmlspark-packages.spec b/tools/mmlspark-packages.spec new file mode 100644 index 0000000000..667cf64056 --- /dev/null +++ b/tools/mmlspark-packages.spec @@ -0,0 +1,67 @@ +cycler=0.10.0 +decorator=4.0.11 +expat=2.1.0 +fontconfig=2.12.1 +glib=2.50.2 +ipykernel=4.6.1 +ipython=6.0.0 +ipython_genutils=0.2.0 +jbig=2.1 +jsonschema=2.6.0 +jupyter_client=5.0.1 +jupyter_console=5.1.0 +jupyter_core=4.3.0 +libffi=3.2.1 +libgcc=5.2.0 +libgfortran=3.0.0 +libiconv=1.14 +libsodium=1.0.10 +matplotlib=2.0.1 +nbformat=4.3.0 +numpy=1.12.1 +olefile=0.44 +pandas=0.19.2 +path.py=10.3.1 +pcre=8.39 +pexpect=4.2.1 +pickleshare=0.7.4 +pillow=4.1.0 +prompt_toolkit=1.0.14 +ptyprocess=0.5.1 +pycosat=0.6.1 +pycrypto=2.6.1 +pyparsing=2.1.4 +python=3.5.2 +python-dateutil=2.6.0 +pytz=2017.2 +pyzmq=16.0.2 +readline=6.2 +requests=2.11.1 +scikit-learn=0.18.1 +scipy=0.19.0 +setuptools=27.2.0 +simplegeneric=0.8.1 +sip=4.18 +six=1.10.0 +tornado=4.5.1 +traitlets=4.3.2 +wcwidth=0.1.7 +wheel=0.29.0 +yaml=0.1.6 +zeromq=4.1.5 +zlib=1.2.8 +notebook=5.0.0 +jinja2=2.9.6 +markupsafe=0.23 +mkl=2017.0.1 +pygments=2.2.0 +jpeg=9b +libpng=1.6.27 +nbconvert=5.1.1 +html5lib=0.999 +bleach=1.5.0 +entrypoints=0.2.2 +mistune=0.7.4 +pandocfilters=1.4.1 +testpath=0.3 +freetype=2.5.5 diff --git 
a/tools/notebook/postprocess.py b/tools/notebook/postprocess.py new file mode 100755 index 0000000000..498985accf --- /dev/null +++ b/tools/notebook/postprocess.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +DEPLOYMENT_KEY = "mml-deploy" +NOTEBOOK_POSTPROC = {} + +def _get_kernel_language(notebook): + name = notebook.metadata.language_info["name"].lower() + if "py" in name: + return "python" + elif "scala" in name: + return "scala" + else: + raise ValueError("Unknown language") + +def _setup_kernel_local(notebook): + if _get_kernel_language(notebook) == "python": + notebook.metadata["kernelspec"] = { + "display_name": "Python [default]", + "language": "python", + "name": "python3"} + notebook.metadata["language_info"] = { + "codemirror_mode": {"name": "ipython", "version": 3.0}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2"} + return notebook +NOTEBOOK_POSTPROC["local"] = _setup_kernel_local + +def _setup_kernel_hdinsight(notebook): + from nbformat.notebooknode import NotebookNode + if _get_kernel_language(notebook) == "python": + notebook.metadata["kernelspec"] = { + "display_name": "PySpark3", + "language": "", + "name": "pyspark3kernel"} + notebook.metadata["language_info"] = { + "codemirror_mode": {"name": "python", "version": 3}, + "mimetype": "text/x-python", + "name": "pyspark3", + "pygments_lexer": "python3"} + return notebook +NOTEBOOK_POSTPROC["hdinsight"] = _setup_kernel_hdinsight + +def _notebooks_for_target(notebooks, target): + """Returns the subset of `notebooks` that must be deployed to a given + `target`. + :param notebooks: List of (file_name, NotebookNode) tuples. + :param target: Deployment target. + :rtype: List of (file_name, NotebookNode)""" + + from nbformat.notebooknode import NotebookNode + from copy import deepcopy + return [(notebook[0], deepcopy(notebook[1])) for notebook in notebooks + if target in notebook[1].metadata.get(DEPLOYMENT_KEY, target)] + +def _cells_for_target(notebook, target): + """Returns a notebook containing only the cells that must be deployed + to `target`. 
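    For example, a cell whose metadata contains {"mml-deploy": "hdinsight"} is
    kept only when `target` is "hdinsight", while a cell without the
    "mml-deploy" key is kept for every target (the lookup defaults to `target`
    itself).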
+ :param notebook: NotebookNode containing the cells and other metadata + :param target: Deployment target""" + + notebook["cells"] = [cell for cell in notebook["cells"] + if target in cell.metadata.get(DEPLOYMENT_KEY, target)] + return notebook + +def _postprocessed_notebooks_by_target(notebooks): + """Returns a collection of notebooks for each of the deployment + targets with cells filtered for that target if necessary.""" + + notebooks_by_target = {} + for target in NOTEBOOK_POSTPROC.keys(): + candidate_nb = _notebooks_for_target(notebooks, target) + processed_nb = [(notebook[0], _cells_for_target(notebook[1], target)) + for notebook in candidate_nb] + postprocd_nb = [(notebook[0], NOTEBOOK_POSTPROC[target](notebook[1])) + for notebook in processed_nb] + notebooks_by_target[target] = postprocd_nb + + return notebooks_by_target + +def postprocess_notebooks(input_dir, output_base_dir): + """Scans all notebook files in `input_dir` and outputs + them for each deployment target under `output_base_dir`.""" + + import os + import glob + from nbformat import read, write, NO_CONVERT + notebooks = [(os.path.split(nbfile)[-1], read(nbfile, NO_CONVERT)) + for nbfile in glob.glob(os.path.join(input_dir, "*.ipynb"))] + notebooks_by_target = _postprocessed_notebooks_by_target(notebooks) + + for target, notebooks in notebooks_by_target.items(): + destination_dir = os.path.join(output_base_dir, target) + if not os.path.isdir(destination_dir): + os.makedirs(destination_dir) + for notebook in notebooks: + write(notebook[1], os.path.join(destination_dir, notebook[0])) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser( + description = "Generate notebooks for each of the deployment targets: %s" + % (", ".join(NOTEBOOK_POSTPROC.keys()))) + parser.add_argument("input_dir", help = "Input directory containing notebooks") + parser.add_argument("output_dir", help = "Output directory for notebooks") + args = parser.parse_args() + postprocess_notebooks(args.input_dir, args.output_dir) diff --git a/tools/notebook/tester/NotebookTestSuite.py b/tools/notebook/tester/NotebookTestSuite.py new file mode 100644 index 0000000000..c17b065c37 --- /dev/null +++ b/tools/notebook/tester/NotebookTestSuite.py @@ -0,0 +1,69 @@ +import unittest + +class NotebookTestSuite(unittest.TestCase): + + # Tese are set if $PROC_SHARD has a "/" value + proc_num, proc_mod = (0, 0) + + def setUp(self): + from nbconvert.preprocessors import ExecutePreprocessor + self.preprocessor = ExecutePreprocessor(timeout=600, enabled=True, allow_errors=False) + + @staticmethod + def _discover_notebooks(): + import os, fnmatch + counter = -1 + for dirpath, dirnames, filenames in os.walk("."): + # skip checkpoint directories + if "ipynb_checkpoints" in dirpath: + continue + dirnames.sort() + filenames.sort() + for notebook_file in fnmatch.filter(filenames, "*.ipynb"): + counter += 1 + if (NotebookTestSuite.proc_num == 0 + or counter % NotebookTestSuite.proc_num == NotebookTestSuite.proc_mod): + yield dirpath, notebook_file + + def _in_pyspark(self): + """ + _in_pyspark: Returns true if this test is run in a context that has access to PySpark + """ + try: + from pyspark.sql import SparkSession + return True + except ImportError: + return False + + def edit_notebook(self, nb): + return nb + + @classmethod + def initialize_tests(cls): + import os, re + proc_shard = re.match("^ *(\d+) */ *(\d+) *$", os.getenv("PROC_SHARD","")) + if proc_shard: + NotebookTestSuite.proc_num = int(proc_shard.group(2)) + NotebookTestSuite.proc_mod = 
int(proc_shard.group(1)) - 1 + if not NotebookTestSuite.proc_mod < NotebookTestSuite.proc_num: + raise Exception("proc_shard: n should be <= m in n/m") + for dirpath, file_name in NotebookTestSuite._discover_notebooks(): + test_name = "test_" + re.sub("\\W+", "_", file_name) + def make_test(nbfile): + return lambda instance: instance.verify_notebook(nbfile) + setattr(cls, test_name, make_test(os.path.join(dirpath, file_name))) + + def verify_notebook(self, nbfile): + """ + verify_notebook: Runs a notebook and ensures that all cells execute without errors. + """ + from nbformat import read as read_nb, NO_CONVERT + try: + # First newline avoids the confusing "F"/"." output of unittest + print("\nTesting " + nbfile) + nb = read_nb(nbfile, NO_CONVERT) + if self._in_pyspark(): + nb = self.edit_notebook(nb) + self.preprocessor.preprocess(nb, {}) + except Exception as err: + self.fail(err) diff --git a/tools/notebook/tester/TestNotebooksLocally.py b/tools/notebook/tester/TestNotebooksLocally.py new file mode 100644 index 0000000000..e1d8e9c840 --- /dev/null +++ b/tools/notebook/tester/TestNotebooksLocally.py @@ -0,0 +1,36 @@ +import unittest +from NotebookTestSuite import NotebookTestSuite + +class LocalNotebookTests(NotebookTestSuite): + + def edit_notebook(self, nb): + """ + Inject the code needed to setup and shutdown spark and sc magic variables. + """ + from nbformat.notebooknode import NotebookNode + from textwrap import dedent + preamble_node = NotebookNode(cell_type="code", source=dedent(""" + from pyspark.sql import SparkSession + spark = SparkSession.builder.appName("NotebookTestSuite").master("local[*]").getOrCreate() + globals()["spark"] = spark + globals()["sc"] = spark.sparkContext + """)) + epilogue_node = NotebookNode(cell_type="code", source=dedent(""" + try: + spark.stop() + except: + pass + """)) + nb.cells.insert(0, preamble_node) + nb.cells.append(epilogue_node) + return nb + +if __name__ == "__main__": + import os, xmlrunner + LocalNotebookTests.initialize_tests() + outsfx = None + if LocalNotebookTests.proc_num > 0: + outsfx = str(LocalNotebookTests.proc_mod + 1) + result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults"), + outsuffix=outsfx), + failfast=False, buffer=False, catchbreak=False) diff --git a/tools/notebook/tester/TestNotebooksOnHdi.py b/tools/notebook/tester/TestNotebooksOnHdi.py new file mode 100644 index 0000000000..3911e9103d --- /dev/null +++ b/tools/notebook/tester/TestNotebooksOnHdi.py @@ -0,0 +1,48 @@ +import unittest +from NotebookTestSuite import NotebookTestSuite +from nbconvert.preprocessors import ExecutePreprocessor +from nbconvert.preprocessors.execute import CellExecutionError +from textwrap import dedent + +class ExecuteSparkmagicPreprocessor(ExecutePreprocessor): + + def preprocess_cell(self, cell, resources, cell_index): + """ + Executes a single code cell. See base.py for details. + + To execute all cells see :meth:`preprocess`. 
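        Unlike the stock ExecutePreprocessor, any "stderr" stream output is
        treated as a failure: the offending cell's source and the stderr text
        are wrapped in a CellExecutionError.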
+ """ + if cell.cell_type != "code": + return cell, resources + outputs = self.run_cell(cell) + cell.outputs = outputs + if not self.allow_errors: + for out in outputs: + if out.output_type == "stream" and out.name == "stderr": + pattern = u"""\ + An error occurred while executing the following cell: + ------------------ + {cell.source} + ------------------ + {out.text} + """ + msg = dedent(pattern).format(out=out, cell=cell) + raise CellExecutionError(msg) + return cell, resources + + +class HdiNotebookTests(NotebookTestSuite): + + def setUp(self): + self.preprocessor = ExecuteSparkmagicPreprocessor(timeout=600, enabled=True, + allow_errors=False) + +if __name__ == "__main__": + import os, xmlrunner + HdiNotebookTests.initialize_tests() + outsfx = None + if HdiNotebookTests.proc_num > 0: + outsfx = str(HdiNotebookTests.proc_mod + 1) + result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults"), + outsuffix=outsfx), + failfast=False, buffer=False, catchbreak=False) diff --git a/tools/notebook/tester/parallel_run.sh b/tools/notebook/tester/parallel_run.sh new file mode 100755 index 0000000000..88f19ecf8e --- /dev/null +++ b/tools/notebook/tester/parallel_run.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Use this script to run the tests in N parallel processes, where +# notebooks are split among them. It would have been better to combine +# python's unittest with multiprocessing (create a process pool, use +# p.map), but that will be more work, especially for dealing with the +# xml output. + +# Arguments: proc_num py_file [args...] +proc_num="$1"; shift +py_file="$1"; shift + +prefix_lines() { # pfx + local line + while read -r line; do printf "%s| %s\n" "$1" "${line%$'\r'}"; done +} + +onerun() { # id [args...] + local id="$1"; shift + # dump the prefixed output on the correct fd; use script to fake a tty + # so the python process would show progress. + PROC_SHARD="$id" \ + script -qefc "$(printf "%q " python "$py_file" "$@")" /dev/null \ + 1> >(prefix_lines "$id") 2> >(prefix_lines "$id" 1>&2) +} + +procs=() +for ((i=1; i <= proc_num; i++)); do onerun "$i/$proc_num" "$@" & procs+=($!); done + +status=0 +for p in "${procs[@]}"; do wait "$p" || status="$?"; done +exit $status diff --git a/tools/pip/MANIFEST.in b/tools/pip/MANIFEST.in new file mode 100644 index 0000000000..8a2066b50b --- /dev/null +++ b/tools/pip/MANIFEST.in @@ -0,0 +1,5 @@ +# Misc +include LICENSE.txt + +# documentation, if any - there may be a minimal set of docs: +# recursive-include docs *.html *.txt *.js diff --git a/tools/pip/README.txt b/tools/pip/README.txt new file mode 100644 index 0000000000..b73c4462e6 --- /dev/null +++ b/tools/pip/README.txt @@ -0,0 +1,8 @@ +Microsoft ML for Apache Spark +============================= + +This package contains the PySpark library for MMLSpark. + +This library provides spark estimators, transformers, and utility functions +for machine learning on Spark. For more complete documentation, refer to +the MMLSpark repo: https://github.com/Azure/mmlspark . diff --git a/tools/pip/generate-pip.sh b/tools/pip/generate-pip.sh new file mode 100755 index 0000000000..02aed7d7a4 --- /dev/null +++ b/tools/pip/generate-pip.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +main() { + +local srcdir="$TOOLSDIR/pip" +local destdir="$BUILD_ARTIFACTS/packages/pip" +local tempdir="$destdir/mmlspark" +local wheelfile="$destdir/$PIP_PACKAGE" + +# Create the package structure in the temp packaging directory +_rmcd "$tempdir" +_ cp "$srcdir/"* .; _rm *.sh +_ unzip -q "$BUILD_ARTIFACTS/sdk/mmlspark.zip" +_ cp "$BASEDIR/LICENSE" "LICENSE.txt" + +# Create the package +_ python setup.py bdist_wheel --universal -d "$destdir" +if [[ -r "$wheelfile" ]]; then show - "Generated wheel: $wheelfile" +else failwith "expected wheel file missing: $wheelfile"; fi + +# Cleanup +_ cd "$destdir" +_rm "$tempdir" + +} +main "$@" diff --git a/tools/pip/setup.py b/tools/pip/setup.py new file mode 100644 index 0000000000..93dd8e1f88 --- /dev/null +++ b/tools/pip/setup.py @@ -0,0 +1,33 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import setuptools, os + +setuptools.setup( + name = "mmlspark", + version = os.environ["MML_VERSION"], + description = "Microsoft ML for Spark", + long_description = "The Microsoft ML for Apache Spark package provides a python API to scala.", + license = "MIT", + packages = ["mmlspark"], + + # Project's main homepage. + url = "https://github.com/Azure/mmlspark", + # Author details + author = "Microsoft", + author_email = os.environ["SUPPORT_EMAIL"], + + classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Data Scientists", + "Topic :: Software Development :: Datascience Tools", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3" + ], + + zip_safe = True, + + package_data = {"mmlspark": ["../LICENSE.txt", "../README.txt"]} +) diff --git a/tools/pytests/auto-tests b/tools/pytests/auto-tests new file mode 100755 index 0000000000..def21357f5 --- /dev/null +++ b/tools/pytests/auto-tests @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" +main() { + +show section "Running Generated Python Tests" +local testdir="$TEST_RESULTS/generated_pytests" t status=0 +cd "$testdir" +for t in *"_tests.py"; do + printf "\n\n==================== %s ====================\n" "$t" + _pytest "$t" || status=$? +done +if ((status)); then failwith "failures in generated python tests"; fi + +} +main "$@" diff --git a/tools/pytests/notebook-tests b/tools/pytests/notebook-tests new file mode 100755 index 0000000000..5149d3face --- /dev/null +++ b/tools/pytests/notebook-tests @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" + +show section "Running Local Notebook Tests" +cd "$TEST_RESULTS/notebook_tests/local" +_ cp -a "$BASEDIR/tools/notebook/tester/"* . +_pytest "TestNotebooksLocally.py" || failwith "failures in local notebook tests" diff --git a/tools/pytests/shared.sh b/tools/pytests/shared.sh new file mode 100644 index 0000000000..94354caf12 --- /dev/null +++ b/tools/pytests/shared.sh @@ -0,0 +1,16 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See LICENSE in project root for information. + +_pytest() { + local ret=0 tmperr="/tmp/pytest-stderr-$$" line + # capture stderr and show it on failure, because it looks like log4j is using + # stderr directly, which bypasses unittest's capture of stderr + TEST_RESULTS="$TEST_RESULTS" \ + "$TOOLSDIR/bin/mml-exec" spark-submit "$@" 2> "$tmperr" || { + ret=$? + echo "Standard error for the above failure:" + cat "$tmperr" | while read -r line; do printf " | %s\n" "$line"; done + } + rm -f "$tmperr" + return $ret +} diff --git a/tools/runme/README.txt b/tools/runme/README.txt new file mode 100644 index 0000000000..7eea1b07b0 --- /dev/null +++ b/tools/runme/README.txt @@ -0,0 +1,4 @@ +This directory holds the implementation of the build/install script. +These files are not intended to be used directly, use the toplevel +"runme" script for that. For a description of what it's doing, use +"runme help". diff --git a/tools/runme/build-readme.tmpl b/tools/runme/build-readme.tmpl new file mode 100644 index 0000000000..155e983c34 --- /dev/null +++ b/tools/runme/build-readme.tmpl @@ -0,0 +1,12 @@ +# MMLSpark $MML_VERSION + +* Source: [$BUILD_REPOSITORY_NAME]($BUILD_REPOSITORY_URI), + ${BUILD_SOURCEBRANCH##refs/@(heads/|)} at revision ${BUILD_SOURCEVERSION:0:8} + (by $BUILD_SOURCEVERSIONAUTHOR). + +* Build: $BUILD_DEFINITIONNAME, $BUILD_BUILDNUMBER + (built by $AGENT_NAME on $AGENT_MACHINENAME, $(date +'%F %R')) + +* Info: `$MML_BUILD_INFO` + +Queued by: $BUILD_QUEUEDBY for [$BUILD_REQUESTEDFOR](mailto:$BUILD_REQUESTEDFOREMAIL) diff --git a/tools/runme/build.sh b/tools/runme/build.sh new file mode 100644 index 0000000000..d89b86ed8e --- /dev/null +++ b/tools/runme/build.sh @@ -0,0 +1,249 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Build + +# Since developers usually work in the IDE, most of the build mechanics is done +# by SBT. + +_show_template_line() { + eval show - "$(qstr -not-dollar "${2%$'\n'}")" +} + +_generate_description() { + if [[ "$BUILDMODE" != "server" || "$AGENT_ID" = "" ]]; then return; fi + show section "Generating Build.md" + show command "... > $(qstr "$BUILD_ARTIFACTS/Build.md")" + mapfile -c 1 -C _show_template_line \ + < "$RUNMEDIR/build-readme.tmpl" > "$BUILD_ARTIFACTS/Build.md" + if [[ "$PUBLISH" = "all" ]]; then + printf '\nThis is a publish build.\n' >> "$BUILD_ARTIFACTS/Build.md" + echo "##vso[build.addbuildtag]Publish" + fi + # upload the generated description lazily on exit, so we can add info lines below + echo_exit "##vso[task.uploadsummary]$BUILD_ARTIFACTS/Build.md" +} + +_postprocess_sbt_log() { + # Adapts the SBT output to work nicely with the VSTS build, most of the work + # is for the SPARK output logs + local line rx tag text oIFS="$IFS" + IFS="" # preserve whitespaces + # Prefix finding regexp + rx=$'^(\e[[0-9]+m)?\[?(\e[[0-9]+m)??' + rx+=$'(warning|WARNING|warn|WARN|info|INFO|error|ERROR)' + rx+=$'(\e[[0-9]+m)?\]?(\e[[0-9]+m)? 
*(.*)' + while read -r line || [[ -n "$line" ]]; do + # Drop time stamps from SPARK output lines + line="${line#[0-9][0-9]/[0-9][0-9]/[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9] }" + # Highlight a prefix of "[warning]" with optional brackets, and the same + # for "warn"s, "error"s and "info"s (for info, just drop the prefix); do + # that for uppercase also, but *not* mixed since spark shows a line that + # starts with "Info provided" + if [[ "${line}" =~ $rx ]]; then + tag="${BASH_REMATCH[3],,}" + if [[ "$tag" = "warn" ]]; then tag="warning" + elif [[ "$tag" = "info" ]]; then tag="-" + fi + # preserve the line (with escape sequences) when in interactive mode + if [[ "${BUILDMODE}${BASH_REMATCH[1]}" != "server" ]]; then text="$line" + else text="${BASH_REMATCH[6]}" + fi + show "$tag" "$text" + else + echo "$line" + fi + done + IFS="$oIFS" +} + +_prepare_build_artifacts() { + show section "Preparing Build" + _rm "$BUILD_ARTIFACTS" "$TEST_RESULTS" + _ mkdir -p "$BUILD_ARTIFACTS/sdk" "$TEST_RESULTS" + _ cp -a "$BASEDIR/LICENSE" "$BUILD_ARTIFACTS" + _ cp -a "$BASEDIR/LICENSE" "$BUILD_ARTIFACTS/sdk" + echo "$MML_VERSION" > "$BUILD_ARTIFACTS/version" + local paths + # copy only the test notebooks from notebooks/tests to the local test + # directory -- running all notebooks is covered better by the E2E tests + for paths in "samples:$BUILD_ARTIFACTS/notebooks" "tests:$TEST_RESULTS/notebook_tests"; do + _ "$BASEDIR/tools/notebook/postprocess.py" "$BASEDIR/notebooks/${paths%%:*}" "${paths#*:}" + done +} + +_sbt_run() { # sbt-args... + local flags=""; if [[ "$BUILDMODE" = "server" ]]; then flags="-no-colors"; fi + (set -o pipefail; _ sbt $flags "$@" < /dev/null 2>&1 | _postprocess_sbt_log) \ + || exit $? +} + +_sbt_build() { + show section "Running SBT Build" + local owd="$PWD" restore_opt="$(shopt -p nullglob)"; shopt -s nullglob + cd "$SRCDIR" + local rmjars=( **/"target/scala-"*/!(*"-$MML_VERSION")".jar" ) + $restore_opt + if [[ "${#rmjars[@]}" != "0" ]]; then + show command "rm **/target/...stale-jars" + __ rm "${rmjars[@]}" + fi + local TESTS="$TESTS" + if ! should test scala; then TESTS="none" + else # Hide the "+scala" tag + TESTS=",$TESTS,"; TESTS="${TESTS//,+scala,/,}"; TESTS="${TESTS#,}"; TESTS="${TESTS%,}" + if [[ "$TESTS" = "" ]]; then TESTS="all"; fi + fi + _sbt_run "full-build" + # leave only the -assembley jars under the proper name (and the pom files) + local f; for f in "$BUILD_ARTIFACTS/packages/m2/"**; do case "$f" in + ( *-@(javadoc|sources).jar@(|.md5|.sha1) ) _rm "$f" ;; + ( *-assembly.jar@(|.md5|.sha1) ) _ mv "$f" "${f//-assembly.jar/.jar}" ;; + esac; done + cd "$owd" +} + +_upload_to_storage() { # name, pkgdir, container + show section "Publishing $1 Package" + _ az storage blob upload-batch --account-name "$MAIN_CONTAINER" \ + --source "$BUILD_ARTIFACTS/packages/$2" --destination "$3" +} + +_e2e_script_action() { # script-name file-name config-name + local cnf="$1" script_name="$2" file="$3"; shift 3 + local cluster="${cnf}_CLUSTER_NAME" group="${cnf}_RESOURCE_GROUP" + local url="$STORAGE_URL/$MML_VERSION/$file" + collect_log=1 \ + _ azure hdinsight script-action create "${!cluster}" -g "${!group}" \ + -n "$script_name" -u "$url" -t "headnode;workernode" + echo "$collected_log" + if [[ ! 
"$collected_log" =~ "Operation state: "+"Succeeded" ]]; then + failwith "script action failed" + fi +} +e2ekey="" +_e2e_ssh() { + local cmd keyfile rm_pid ret + cmd=("ssh"); if [[ "$1" = "scp" ]]; then cmd=("$1"); shift; fi + if [[ "$_e2e_key" = "" ]]; then + e2ekey="$(__ az keyvault secret show --vault-name mmlspark-keys --name testcluster-ssh-key)" + e2ekey="${e2ekey##*\"value\": \"}"; e2ekey="${e2ekey%%\"*}"; e2ekey="${e2ekey//\\n/$'\n'}" + fi + keyfile="/dev/shm/k$$"; touch "$keyfile"; chmod 600 "$keyfile"; echo "$e2ekey" > "$keyfile" + cmd+=(-o "StrictHostKeyChecking=no" -i "$keyfile") + if [[ "${cmd[0]}" = "ssh" ]]; then + { sleep 30; rm -f "$keyfile"; } & + rm_pid="$!" + _ -a "${cmd[@]}" "$@"; ret="$?" + kill -9 "$rm_pid" > /dev/null 2>&1; rm -f "$keyfile" + elif [[ "${cmd[0]}" = "scp" ]]; then + _ -a "${cmd[@]}" "$@"; ret="$?" + rm -f "$keyfile" + fi + return $ret +} +_e2e_tests() { + show section "Running E2E Tests" + _e2e_script_action "E2E" "Install MML to E2E Cluster" "install-mmlspark.sh" + _e2e_script_action "E2E" "Setup authorized-keys for E2E" "setup-test-authkey.sh" + local shost="$E2E_CLUSTER_SSH" sdir="$CLUSTER_SDK_DIR/notebooks/hdinsight" + _e2e_ssh scp -p "$TEST_RESULTS/notebook_tests/hdinsight/"* "$shost:$sdir" + _e2e_ssh scp -p "$BASEDIR/tools/notebook/tester/"* "$shost:$sdir" + _e2e_ssh -t -t "$shost" \ + ". /usr/bin/anaconda/bin/activate; \ + cd \"$sdir\"; rm -rf \"../local\"; \ + ./parallel_run.sh 2 \"TestNotebooksOnHdi.py\"" + local ret="$?" + _e2e_ssh scp "$shost:$sdir/TestResults/*" "$TEST_RESULTS" + if ((ret != 0)); then failwith "E2E test failures"; fi +} + +_publish_to_demo_cluster() { + show section "Installing Demo Cluster" + _e2e_script_action "DEMO" "Install MML to Demo Cluster" "install-mmlspark.sh" +} + +_publish_to_dockerhub() { + @ "../docker/build-docker" + local itag="mmlspark:latest" otag otags + otag="microsoft/mmlspark:$MML_VERSION"; otag="${otag//+/_}"; otags=("$otag") + if [[ "$MML_VERSION" = *([0-9.]) ]]; then otags+=( "microsoft/mmlspark:latest" ); fi + show section "Pushing to Dockerhub as ${otags[*]}" + show - "Image info:" + local info="$(docker images "$itag")" + if [[ "$info" != *$'\n'* ]]; then failwith "tag not found: $itag"; fi + info=" | ${info//$'\n'/$'\n | '}" + echo "$info" + local auth user pswd + __ docker logout > /dev/null + auth="$(__ az keyvault secret show --vault-name mmlspark-keys --name dockerhub-auth)" + auth="${auth##*\"value\": \"}"; auth="${auth%%\"*}"; auth="$(base64 -d <<<"$auth")" + user="${auth%%:*}" pswd="${auth#*:}" + ___ docker login -u "$user" -p "$pswd" > /dev/null + unset user pass auth + for otag in "${otags[@]}"; do + show - "Pushing \"$otag\"" + _ docker tag "$itag" "$otag" + _ docker push "$otag" + _ docker rmi "$otag" + done + __ docker logout > /dev/null +} + +_upload_artifacts_to_VSTS() { + if [[ "$BUILDMODE" != "server" ]]; then return; fi + show section "Uploading Build Artifacts to VSTS" + local f d + for f in "$BUILD_ARTIFACTS/"**/*; do + if [[ -d "$f" ]]; then continue; fi + f="${f#$BUILD_ARTIFACTS}"; d="${f%/*}" + echo "##vso[artifact.upload artifactname=Build$d]$BUILD_ARTIFACTS/$f" + done +} + +_upload_artifacts_to_storage() { + show section "Uploading Build Artifacts to Storage" + _ az account show > /dev/null # this fails if not logged-in + local tmp="/tmp/mmlbuild-$$" # temporary place for uploads + mkdir -p "$tmp" + ( cd "$BUILD_ARTIFACTS" + _ zip -qr9 "$tmp/$(basename "$BUILD_ARTIFACTS.zip")" * ) + local f txt + for f in "$TOOLSDIR/hdi/"*; do + txt="$(< "$f")" + 
txt="${txt//<=<=fill-in-maven-package=>=>/com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION}" + txt="${txt//<=<=fill-in-maven-url=>=>/$MAVEN_URL}" + txt="${txt//<=<=fill-in-pip-package=>=>/$PIP_URL/$PIP_PACKAGE}" + txt="${txt//<=<=fill-in-sdk-dir=>=>/$CLUSTER_SDK_DIR}" + txt="${txt//<=<=fill-in-url=>=>/$STORAGE_URL/$MML_VERSION}" + echo "$txt" > "$tmp/$(basename "$f")" + done + _ az storage blob upload-batch --account-name "$MAIN_CONTAINER" \ + --source "$tmp" --destination "$STORAGE_CONTAINER/$MML_VERSION" + _rm "$tmp" + printf '\nCopy the link to [%s](%s) to setup this build on a cluster.' \ + "this HDInsight Script Action" "$STORAGE_URL/$MML_VERSION/install-mmlspark.sh" \ + >> "$BUILD_ARTIFACTS/Build.md" +} + +_full_build() { + show section "Building ($MML_VERSION)" + _ cd "$BASEDIR" + _prepare_build_artifacts + _generate_description + _sbt_build + _ ln -sf "$(realpath --relative-to="$HOME/bin" "$TOOLSDIR/bin/mml-exec")" \ + "$HOME/bin" + should publish maven && _upload_to_storage "Maven" "m2" "$MAVEN_CONTAINER" + should test python && @ "../pytests/auto-tests" + should test python && @ "../pytests/notebook-tests" + should publish pip && @ "../pip/generate-pip.sh" + should publish pip && _upload_to_storage "PIP" "pip" "$PIP_CONTAINER" + should publish storage && _upload_artifacts_to_storage + should test e2e && _e2e_tests + should publish demo && _publish_to_demo_cluster + should publish docker && _publish_to_dockerhub + _upload_artifacts_to_VSTS + return 0 +} diff --git a/tools/runme/install.sh b/tools/runme/install.sh new file mode 100644 index 0000000000..007cab3b1f --- /dev/null +++ b/tools/runme/install.sh @@ -0,0 +1,206 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Environment Installation + +inst_work_done="" +_note_work() { # what, as something that can be shown in "_ done" + # prefer bigger work as the label to keep + for what in "Install" "Update" "Environment update"; do + if [[ "$what" = "$1" ]]; then inst_work_done="$1"; fi + if [[ "$what" = "$inst_work_done" ]]; then return; fi + done +} + +_verify_version() { # [-q] what; prints error if mismatched version + # "-q" => quick mode: just check that the version file matches + local quick="N"; if [[ "x$1" = "x-q" ]]; then quick="Y"; shift; fi + libname="$1"; shift + local lib vers ver; set_install_info_vars "$libname" lib vers ver + local dir="$HOME/lib/$lib" + if [[ ! -d "$dir" ]]; then + echo "$libname is not installed (missing dir: $dir)"; return + fi + local vcmd="" pat1 pat2 actual line ver_file="$dir/$LIB_VERSION_FILE" + if [[ "$vers" != "" ]]; then vcmd="${vers%%|*}"; vers="${vers#*|}"; fi + if [[ "$quick" = "Y" && -r "$ver_file" ]]; then + read -r actual < "$ver_file" + if [[ "$actual" = "$ver" ]]; then return; fi # look for $ver here! 
+ fi + if [[ "$vcmd" = "" ]]; then echo "no version information"; return; fi + actual="$(cd "$dir"; ___ $vcmd 2>&1)" + if [[ $'\n'"$actual"$'\n' != *$'\n'$vers$'\n'* ]]; then # $vers can have globs + printf 'unexpected %s version,\n wanted:\n | %s\n got:\n | %s' \ + "$libname" "$vers" "${actual//$'\n'/$'\n | '}" + fi +} + +_do_envinits() { + cd "$HOME" + local cmd script="" f="$PROFILE_FILE" add_init="N" orig_profile="$MMLSPARK_PROFILE" + # create the init script + for cmd in "${envinit_commands[@]}"; do + script+="$cmd"$'\n' + local var="" val + if [[ "$cmd" = "export "*"="* ]]; then + var="${cmd#export }"; var="${var%%=*}"; val="$(qstr "${!var}")" + fi + # print only commands and setenvs that change values + if [[ "$var" = "" || "$cmd" != "export $var=$val" ]]; then show command "$cmd"; fi + done + eval "$script" + if [[ ! -e "$f" ]]; then add_init="Y"; show section "Creating $f" + elif [[ "${script%$'\n'}" != "$(<"$f")" ]]; then show section "Updating $f" + else return; fi + _note_work "Environment update" + show command "...init code... > \"$f\"" + echo -n "$script" > "$f" + if [[ "$add_init" = "N" ]]; then return; fi + if [[ "$orig_profile" = "yes" ]]; then + show warning "There was no $f file, but \$MMLSPARK_PROFILE is set," + show warning "so the environment was modified in an unexpected way and" + show warning "therefore no shell init files are modified." + return + fi + show section "Adding environment initialization" + local file haveit="N" text fh="$(qstr "$f")" + local cmd="[[ \"\$MMLSPARK_PROFILE\" != \"\" ]] || . $fh" + local qcmd="$(qstr "$cmd")" + for file in "${ENV_INIT_FILES[@]}"; do + if [[ ! -r "$file" ]]; then continue; fi + haveit="Y" # either it's there or we're adding it + text="$(< "$file")" + if [[ "$text" = *"$cmd"* ]]; then continue; fi + show command "...added init line... > \"$file\"" + # add it at the top since some init files have `return`s in the middle + echo "$cmd"$'\n\n'"$text" > "$file" + done + if [[ "$haveit" = "N" ]]; then + show command "echo $qcmd > \"${ENV_INIT_FILES[0]}\"" + echo "$cmd" > "${ENV_INIT_FILES[0]}" + fi + show - "" + show warning "I made your shell initialization load $f, but this" + show warning "shell is still not initialized. Enter \"source $f\"" + show warning "to do so, or start a new terminal." +} + +_unpack_tgz() { + _ tar xzf "$1" --strip-components=1 +} + +_unpack_zip() { + local restore_opt="$(shopt -p dotglob)"; shopt -s dotglob + _ unzip -q "$1" + local paths=( * ) + if [[ "${#paths[@]}" != "1" || ! -d "${paths[0]}" ]]; then + failwith "empty archive or archive with multiple toplevel directories, $1" + fi + show command "mv ${paths[0]}/* .; rmdir ${paths[0]}" + local tmp="...install-tmp-$$" + mv "${paths[0]}" "$tmp" + mv "$tmp"/* . + rmdir "$tmp" + $restore_opt +} + +_unpack_sh() { + if [[ "x$instcmd" != "x" ]]; then eval "_ $instcmd" + else failwith "sh package without instcmd: $1"; fi +} + +_retrieve_file() { # url file sha256 + # Retrieve the $url into $file with a cache left in $INSTALLER_CACHE_DIR; the + # file will actually be a symlink to the cache; if $INSTALLER_CACHE_DIR is + # empty no cache is used; verify sha256 checksum; only verified files are + # cached; files in the cache are assumed to be valid. 
+ local url="$1" target="$2" sha256="$3"; shift 3 + local cache="$INSTALLER_CACHE_DIR/$(basename "$target")" + if [[ -n "$INSTALLER_CACHE_DIR" && -r "$cache" && -r "$cache.sha256" + && "$(< "$cache.sha256")" = "$sha256" ]]; then + _ ln -sf "$cache" "$target"; return + fi + _ curl --output "$target" $CURL_FLAGS "$url" + local sha256sum="$(__ sha256sum "$target")"; sha256sum="${sha256sum%% *}" + if [[ "x$sha256sum" = "x" ]]; then failwith "could not get sha256 checksum"; fi + if [[ "$sha256sum" != "$sha256" ]]; then + failwith "sha256 checksum failed for $target (retrieved from $url)" + fi + if [[ -z "$INSTALLER_CACHE_DIR" ]]; then return; fi + _md "$INSTALLER_CACHE_DIR" + _ mv "$target" "$cache"; echo "$sha256" > "$cache.sha256" + _ ln -s "$cache" "$target" +} + +_install() { # libname + libname="$1"; shift + local lib envvar url sha256 instcmd exes vers ver bindir prereq where + set_install_info_vars "$libname" \ + lib envvar url sha256 instcmd exes vers ver bindir prereq where + if [[ ( "$BUILDMODE" = "server" && " $where " != *" build "* ) + || ( "$BUILDMODE" = "runtime" && " $where " != *" runtime "* ) + || ( " $where " != *" devel "* ) ]]; then return + fi + local dir="$HOME/lib/$lib" + setenv "${envvar}_VERSION" "$ver" + setenv "${envvar}_HOME" "$dir" + if [[ "x$prereq" != "x" ]] && ! eval "${prereq%|*}" > /dev/null 2>&1; then + failwith "$libname: prerequisite failure: ${prereq##*|}" + fi + if [[ "$(_verify_version -q "$libname")" = "" ]]; then + cd "$dir"; call_ifdef "$libname.init" # can use $ver + return + fi + local update="N" Op; if [[ -r "$dir/$LIB_VERSION_FILE" ]]; then update="Y"; fi + if [[ "$update" = "Y" ]]; then Op="Updating"; _note_work "Update" + else Op="Installing"; _note_work "Install"; fi + # avoid output up to here, so there's nothing unless we actually do something + show section "$Op $libname v$ver in $dir" + show command setenv "${envvar}_VERSION" "$ver" + show command setenv "${envvar}_HOME" "$dir" + if [[ "$update" = "Y" && "$(_verify_version "$libname")" = "" ]]; then + show warning "Looks like $libname was already updated, noting new version" + _ cd "$dir" + else + if [[ "$update" = "Y" ]]; then show warning "Removing $dir!"; _rm "$dir"; fi + if [[ -d "$dir" ]]; then failwith "directory exists, please remove it: $dir"; fi + local sfx="$(get_suffix "$url")"; if [[ "$sfx" = "tar.gz" ]]; then sfx="tgz"; fi + local file="/tmp/$lib.$sfx" + _retrieve_file "$url" "$file" "$sha256" + _mcd "$dir" + if [[ "$(type -t _unpack_$sfx)" = "function" ]]; then _unpack_$sfx "$file" + else failwith "unknown package file suffix: $sfx"; fi + _rm "$file" + fi + map call_ifdef "$libname.setup" "$libname.init" # can use $ver + if [[ "$setup_function" != "" ]]; then _ "$setup_function"; fi + show command "...text... > $(qstr "$LIB_VERSION_FILE")" + { echo "$ver" + echo "" + echo "This directory has an installation of $libname v$ver" + echo "It has been created by the MMLSpark build script: as long as this file" + echo "exists, the build script is allowed to remove it for version updates" + echo "when needed. Please do not modify it." 
+ } > "$dir/$LIB_VERSION_FILE" + _ cd "$HOME/bin" + local exe + for exe in $exes; do _ ln -sf "../lib/$lib/$bindir/$exe" "$exe"; done + if [[ "$vers" != "" ]]; then + show debug "verifying $libname installation version" + local err="$(_verify_version "$libname")" + if [[ "$err" != "" ]]; then _rm -rf "$dir"; failwith "$err"; fi + fi +} + +# Main entry point +_install_environment() { + # Common directories + _md "$HOME/bin" "$HOME/lib" + # Installations + map _install "${install_packages[@]}" + _ cd "$BASEDIR" + _rm "$CONF_TRACK_FILE" + # Set vars and setup environment initialization + _do_envinits +} diff --git a/tools/runme/runme.sh b/tools/runme/runme.sh new file mode 100755 index 0000000000..f66917ce8a --- /dev/null +++ b/tools/runme/runme.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# Load once +if [[ "${RUNME_LOADED:-}" = "$$" ]]; then return; else RUNME_LOADED="$$"; fi + +# extra bash globs, quote expansion of quoted parameters +shopt -s globstar extglob extquote + +# Where are we? +RUNMEDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)" +TOOLSDIR="$(dirname "$RUNMEDIR")" +BASEDIR="$(dirname "$TOOLSDIR")" + +if [[ "${OS:-}" = "Windows_NT" ]]; then + echo "This script cannot run on Windows (yet)." 1>&2; exit 1 +fi + +# PATH for these scripts: conservative (will include "$HOME/bin" later) +PATH="/usr/bin:/bin" + +# shared for runme and all scriplets +. "$RUNMEDIR/utils.sh" +[[ -r "$TOOLSDIR/local-config.sh" ]] && @ "$TOOLSDIR/local-config.sh" +@ "../config.sh"; _post_config + +# main runme functionality +_runme() { + @ "install.sh" + @ "build.sh" + case "$BUILDMODE" in + ( "build" | "server" ) + _install_environment + _full_build + ;; + ( "setup" | "runtime" ) + _install_environment + ;; + ( "" ) + _install_environment + if [[ "$inst_work_done" = "" ]]; then _full_build; exit; fi + show section "$inst_work_done done" + show warning "You can use the environment now," \ + "or run this script again to build." + ;; + ( * ) + failwith "unknown build mode: $BUILDMODE" + ;; + esac +} diff --git a/tools/runme/show-version b/tools/runme/show-version new file mode 100755 index 0000000000..f014081741 --- /dev/null +++ b/tools/runme/show-version @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +echo "$MML_VERSION" diff --git a/tools/runme/utils.sh b/tools/runme/utils.sh new file mode 100644 index 0000000000..a102502526 --- /dev/null +++ b/tools/runme/utils.sh @@ -0,0 +1,450 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Utilities + +# ---< defvar [opts] var val >-------------------------------------------------- +# Use this to define customize-able variables (no effect if it exists). +# Additional arguments are concatenated (wihtout spaces) to make long value +# settings look nice. Use "-x" to export the variable, "-p" to resolve the +# value to an absolute path from where we are, "-f" to set the value even if +# it's already set. You can also use "-d" to define values with delayed +# references to other variables using "...<{var}>..." 
-- these will be replaced +# at the end of processing the config file. +_delayed_vars=() +defvar() { + local opts=""; while [[ "x$1" == "x-"* ]]; do opts+="${1:1}"; shift; done + local var="$1" val v; shift + if [[ "$opts" == *"f"* || -z "${!var+x}" ]]; then + val=""; for v; do val+="$v"; done; printf -v "$var" "%s" "$val"; fi + if [[ "$opts" == *"p"* && "x${!var}" != "/"* ]]; then + printf -v "$var" "%s" "$(realpath -m "${!var}")"; fi + if [[ "$opts" == *"x"* ]]; then export "$var"; fi + if [[ "$opts" == *"d"* ]]; then _delayed_vars+=( "$var" ); fi +} +_replace_delayed_vars() { + local var val pfx sfx change=1 + for var in "${_delayed_vars[@]}"; do + val="${!var}" + while [[ "$val" = *"<{"*"}>"* ]]; do + pfx="${val%%"<{"*}"; val="${val#*"<{"}" + sfx="${val#*"}>"}"; val="${val%%"}>"*}" + val="$pfx${!val}$sfx" + printf -v "$var" "%s" "$val" + done + done +} + +# Parse `X=Y` arguments, stop at a "--" +while [[ "$#" -gt 0 ]]; do case "$1" in + ( *"="* ) defvar -f "${1%%=*}" "${1#*=}" ;; + ( "--" ) shift; break ;; + ( "-h" | "--help" | "help" ) + text="$(<"$BASEDIR/runme")" + text="${text#*$'\n'+(#)$'\n# '}"; text="${text%$'\n'+(#)$'\n'*}" + text="${text//$'\n'#?( )/$'\n'}" + echo "$text" + exit + ;; + ( * ) echo "WARNING: ignoring unrecognized argument \"$1\"" 1>&2; sleep 1 ;; +esac; shift; done + +# ---< @ bash-file arg... >----------------------------------------------------- +# Similar to `script` for loading a bash library file, except that the path is +# relative to the file that used `@`. +@ () { + local lib="$1" srcdir="$(dirname ${BASH_SOURCE[1]})"; shift + lib="$(cd "$srcdir"; realpath "$lib")" + if [[ ! -r "$lib" ]]; then failwith "lib: file not found, $lib"; fi + . "$lib" "$@" +} + +# VSTS: +# Details on available environment variables: +# https://www.visualstudio.com/en-us/docs/build/define/variables +# to color output, start an output line with "##[]", these are +# known tags: section, command, error, warning, debug; also, there are +# various meta "##vso[...]" instructions, see: +# https://github.com/Microsoft/vsts-tasks/blob/master/docs/authoring/commands.md + +# ---< show tag message... >---------------------------------------------------- +# Display a message type classified by the given tag, in a way that is proper +# for the build context: on the build server use VSO magic outputs. Accepted +# tags are: "section", "warning", "command", "debug", "error", or "-" for +# generic output. $hide_in_log can be set to a string holding sensitive +# information that should be hidden in the output. The display uses "$HOME" +# instead of the actual value whenever it appears. +first_show="Y" +hide_in_log="" +show() { + local tag="$1"; shift + if [[ "$first_show" = "Y" ]]; then first_show="N"; + elif [[ "x$tag" = "xsection" ]]; then echo "" + fi + if [[ "x$tag" = "x-" ]]; then tag="" + elif [[ "$BUILDMODE" = "server" ]]; then tag="##[$tag]" + else case "$tag" in + ( "section" ) tag="===>>> " ;; + ( "warning" ) tag="*** " ;; + ( "command" ) tag="$ " ;; + ( "debug" ) tag=">> " ;; + ( "error" ) tag="!!! " ;; + ( * ) failwith "this script is broken, don't know about display tag: $tag" ;; + esac; fi + local line="$tag${*//"$HOME"/\$HOME}" + if [[ "$hide_in_log" != "" ]]; then + line="$tag${*//"$hide_in_log"/[...]}" + fi + echo "$line" +} + +# ---< failwith message... >---------------------------------------------------- +# Abort the run with the given error message. 
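+# It does not return (the whole run exits with status 1), so it can be used as
+# the right-hand side of a guard, e.g.:
+#   [[ -d "$BASEDIR" ]] || failwith "missing base directory: $BASEDIR"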
+failwith() { show error "Error: $*" 1>&2; exit 1; } + +_killed_handler() { echo ""; failwith "Aborting..."; } +builtin trap _killed_handler 2 3 9 15 + +# ---< map cmd arg... >--------------------------------------------------------- +# Apply $cmd on each of the arguments. +map() { local cmd="$1" arg; shift; for arg; do $cmd "$arg"; done; } + +# ---< echo_exit message... >--------------------------------------------------- +# Echo a message on exit. +_exit_strings=() +_show_exit_strings() { map echo "${_exit_strings[@]}"; } +trap _show_exit_strings 0 +echo_exit() { _exit_strings+=("$*"); } + +# protection from mistakingly overwriting traps in scriplets +trap() { failwith "cannot overwrite traps (in \"trap $*\")"; } + +# ---< qstr [-not-dollar] str... >---------------------------------------------- +# Quotes the input as a shell-parsable string, also using $HOME instead of its +# value (better than printf with "%q" which tends to uglingly backslash spaces). +# If "-not-dollar" then avoid quoting dollar signs. +qstr() { + local replace='\ " ` $' + if [[ "x$1" = "x-not-dollar" ]]; then replace='\ " `'; shift; fi + local str="$*" ch + for ch in $replace; do str="${str//"$ch"/\\$ch}"; done + echo "\"${str//$HOME/\$HOME}\"" +} + +# ---< maybe_qstr str... >------------------------------------------------------ +# Quotes the input as a shell-parsable string (using qstr) only if needed. +maybe_qstr() { + local str="$*" + if [[ "$(printf "%q" "$str")" = "$str" ]]; then echo "$str"; else qstr "$str"; fi +} + +# ---< _ [flags] cmd arg... >--------------------------------------------------- +# Run the given $cmd very carefuly. Exit on error, unless flags have "-a". +# Normally, the command is shown (using "show command") unless flags have "-q". +# If $collect_log is set to 1 then instead of showing the command's stdout it is +# captured in $collected_logs (which can also be used to suppress showing the +# output), or set it to 2 to capture both stdout and stderr (this is better than +# redirecting to /dev/null since that will swallow failure messages as well). +collect_log=0 collected_log="" +declare -A known_exes +_() { + local verbose=1 abortonfail=1 + while [[ "x$1" = "x-"* ]]; do + case "${1#-}" in + ( "q" ) verbose=0 ;; + ( "a" ) abortonfail=0 ;; + ( * ) failwith "internal error, unknown flag for '_': $1" + esac + shift + done + local sets=() + while [[ "$1" =~ ^[A-Za-z_][A-Za-z_0-9]*= ]]; do sets+=( "$1" ); shift; done + local cmd="$1"; shift + local exe="${known_exes[$cmd]}" + if [[ "$exe" = "" ]]; then + exe="$(type -p "$cmd")" + if [[ "$exe" = "" && "$(type -t "$cmd")" != "" ]]; then exe="$cmd"; fi + if [[ "$exe" = "" ]]; then failwith "could not find executable: $cmd"; fi + known_exes[$cmd]="$exe" + fi + if ((verbose)); then + local to_show="" x + for x in "${sets[@]}"; do to_show+=" ${x%%=*}=$(maybe_qstr "${x#*=}")"; done + for x in "$cmd" "$@"; do to_show+=" $(maybe_qstr "$x")"; done + show command "${to_show:1}" + fi + args=( "$@" ) + __run_it__() { + case $collect_log in + ( 2 ) collected_log="$("$exe" "${args[@]}" 2>&1)" ;; + ( 1 ) collected_log="$("$exe" "${args[@]}")" ;; + ( * ) "$exe" "${args[@]}" ;; + esac + } + if [[ "${#sets[@]}" = 0 ]]; then __run_it__ + else # can't put "x=y"s in a variable, so use eval + local pfx="" + for x in "${sets[@]}"; do pfx+="${x%%=*}=$(printf "%q" "${x#*=}") "; done + eval "${pfx}__run_it__" + fi + local ret=$? + if [[ $ret != 0 && $abortonfail -ge 1 ]]; then failwith "failure when running $cmd $*" + else return $ret; fi +} + +# ---< __ cmd arg... 
>----------------------------------------------------------
+# Convenient shorthand for "_ -q cmd arg..."
+__() { _ -q "$@"; } # same, but no command display
+
+# ---< ___ cmd arg... >---------------------------------------------------------
+# Convenient shorthand for "_ -q -a cmd arg..."
+___() { _ -q -a "$@"; } # same, but no command display and no aborting on failure
+
+# ---< _rm path... >------------------------------------------------------------
+# Removes a file or directory if it exists.
+_rm_() {
+  if [[ -d "$1" ]]; then _ rm -rf "$1"; elif [[ -e "$1" ]]; then _ rm -f "$1"; fi;
+}
+_rm() { map _rm_ "$@"; }
+
+# ---< _md dir >----------------------------------------------------------------
+# Create a directory (with -p) if it doesn't exist.
+_md_() { if [[ ! -d "$1" ]]; then _ mkdir -p "$1"; fi; }
+_md() { map _md_ "$@"; }
+
+# ---< _mcd dir >---------------------------------------------------------------
+# Create a directory (with -p) and cd into it.
+_mcd() { _md "$1"; _ cd "$1"; }
+
+# ---< _rmcd dir >--------------------------------------------------------------
+# Same as _mcd, removing the directory if it exists.
+_rmcd() { _rm "$1"; _mcd "$1"; }
+
+# ---< get_suffix path >--------------------------------------------------------
+# Prints the suffix of a given path. Properly deals with filenames that begin
+# with a "." and with multiple suffixes (like ".tar.gz"); suffixes are
+# alphanumeric with at least one alphabetic character.
+get_suffix() {
+  rx="[^/](([.][a-zA-Z0-9_]*[a-zA-Z][a-zA-Z0-9_]*)+)$"
+  if [[ "$1" =~ $rx ]]; then echo "${BASH_REMATCH[1]:1}"; fi
+}
+
+# ---< call_ifdef [_] fun arg... >----------------------------------------------
+# If the named function exists, calls it with the given arguments. Calls only
+# functions, not external executables or builtins.
+call_ifdef() {
+  local pfx=""; if [[ "$1" = "_" ]]; then pfx="_"; shift; fi
+  local fun="$1"; shift
+  if [[ "$(type -t "$fun")" = "function" ]]; then $pfx "$fun" "$@"; fi
+}
+
+# ---< deftag tag [supertag] >--------------------------------------------------
+# Define a tag, possibly as a subtag of supertag (which defaults to `all`).
+# Note that tags are shared for both `$TESTS` and `$PUBLISH`.
+declare -A _tag_parent
+deftag() {
+  if [[ -n "$2" && "$2" != "all" && -z "${_tag_parent[$2]}" ]]; then
+    failwith "deftag: unknown parent tag, $2"
+  fi
+  _tag_parent[$1]="${2:-all}"
+}
+
+# ---< should what tag... >-----------------------------------------------------
+# Returns a zero (success) status if `tag` should be "what"-ed (tested or
+# published) according to $TESTS or $PUBLISH, or one (failure) status otherwise.
+# If more than one tag is given, scalatest semantics are followed: succeed if at
+# least one of the tags is included and none are excluded. Convenient to use as:
+# `should test foo && run_foo_test`.
+_get_tag_value() {
+  local ret="${info[$1]}"
+  if [[ "$ret" = "" ]]; then
+    if [[ -z "${_tag_parent[$1]}" ]]; then ret="."
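+    # "." marks a tag that was never mentioned in the spec (neither included
+    # nor excluded); _has_tag below only reacts to "1" (include) and "0" (exclude)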
+ else ret="$(_get_tag_value "${_tag_parent[$1]}")"; fi + info[$1]=$ret + fi + echo $ret +} +_get_valid_tag_value() { + if [[ "$1" != "all" && "$1" != "none" && -z "${_tag_parent[$1]}" ]]; then + failwith "should: unknown tag, $1 {{${_tag_parent[$1]}}}" + fi + _get_tag_value "$1" +} +_has_tag() { + local -n info="$1"; shift + if (($# == 0)); then failwith "should: missing tag(s)"; fi + # mimic the scalatest logic: needs one tag included and none excluded + local r="" t; for t; do r+="$(_get_valid_tag_value "$t")"; done + [[ "$r" = *1* && "$r" != *0* ]] +} +should() { + what="$1"; shift + case "$what" in + ( "test" | "publish" ) _has_tag "_${what}_info" "$@" ;; + ( * ) failwith "should: unknown tag info, $what" ;; + esac +} + +# ---< get_install_info libname key >------------------------------------------- +# Print the value for $key in the setup section of $libname. Properly deals +# with various default values. +get_install_info() { + local ret="${_install_info[$1.$2]}" + if [[ "$ret" = "" ]]; then + case "$2" in + ( "lib" ) ret="${1,,}" ;; + ( "envvar" ) ret="${1^^}" ;; + ( "bindir" ) ret="bin" ;; + esac + if [[ "$ret" != "" ]]; then + _install_info[$1.$2]="$ret"; _replace_ver_in_info $1.$2 + fi + fi + echo "$ret" +} + +# ---< set_install_info_vars libname key... >----------------------------------- +# Get the value for each $key in the setup section of $libname, and set the +# variable whose name is $key to this value. +set_install_info_vars() { + local libname="$1" var val; shift + for var; do + printf -v "$var" "%s" "$(get_install_info "$libname" "$var")" + done +} + +# ---< env_eval str... >-------------------------------------------------------- +# Evaluate an expression and make sure that it's also included in the +# user's environment. The commands are held in $envinit_commands which +# can be added to if you want to include something in the environment +# but not evaluate it right now. (This is written out by code in "install.sh".) +envinit_commands=('export MMLSPARK_PROFILE="yes"') +envinit_eval() { envinit_commands+=("$*"); eval "$*"; } + +# ---< setenv var val >--------------------------------------------------------- +# Set an environment variable; include the setting in the user environment too. +setenv() { envinit_eval "export $1=$(qstr "$2")"; } + +# ---< get_runtime_hash >------------------------------------------------------- +# Prints out a hash of the currently configured runtime environment. The hash +# depends on the relevant bits of configuration, including a .setup and .init +# function definitions, if any. 
+get_runtime_hash() { + local hash="$( + for libname in "${install_packages[@]}"; do + set_install_info_vars "$libname" lib sha256 instcmd exes where + if [[ " $where " != *" runtime "* ]]; then continue; fi + printf "%s\n" "$libname" "$lib" "$sha256" "$instcmd" "$exes" \ + "$(declare -f "$libname.setup" "$libname.init")" + done | sha256sum)" + echo "${hash%% *}" +} + +# ------------------------------------------------------------------------------ +# Internal functions follow + +# Parse tag specs, used for $TESTS +_parse_tags() { + local -n tags="$1" info="$2" + tags="${tags,,}"; tags="${tags// /,}" + while [[ "$tags" != "${tags//,,/,}" ]]; do tags="${tags//,,/,}"; done + tags="${tags#,}"; tags="${tags%,}"; tags=",$tags" + while [[ "$tags" =~ (.*)","([^+-].*) ]]; do # just "tag" is the same as "+tag" + tags="${BASH_REMATCH[1]},+${BASH_REMATCH[2]}"; done + tags="${tags#,}" + if [[ "$tags," =~ [+-], ]]; then + failwith "empty tag in \$$1" + elif [[ "$tags" =~ [+-]([a-zA-Z0-9_]*[^a-zA-Z0-9_,][^,]*) ]]; then + failwith "bad \$$1 tag name: ${BASH_REMATCH[1]}" + fi + local t pos=0 ts="$tags" + ts="${ts//,/ }" ts="${ts//+/1}"; ts="${ts//-/0}" + for t in $ts; do [[ ${t:0:1} = 1 ]] && pos=1; info[${t:1}]=${t:0:1}; done + if ((!pos)); then info[all]=${info[all]:-1}; fi # no positives => all + if [[ "$tags" == "+"@("all"|"none") ]]; then tags="${tags:1}"; fi +} +declare -A _test_info _publish_info +_parse_TESTS() { _parse_tags TESTS _test_info; } +_parse_PUBLISH() { _parse_tags PUBLISH _publish_info; } + +# Defines $MML_VERSION and $MML_BUILD_INFO +_set_build_info() { + local info version + # make it possible to avoid running git + if [[ ! -z "$MML_BUILD_INFO" && ! -z "$MML_VERSION" ]]; then + info="$MML_BUILD_INFO"; version="$MML_VERSION" + else + local owd="$PWD"; cd "$BASEDIR" + # sanity checks for version tags + local t rx="(0|[1-9][0-9]*)"; rx="^v$rx[.]$rx([.]$rx)?$" + for t in $(git tag -l); do + if [[ ! "$t" =~ $rx ]]; then failwith "found a bad tag name \"$t\""; fi + done + if [[ -r "$BUILD_ARTIFACTS/version" ]]; then + # if there is a built version, use it, so that we don't get a new + # version after commits are made + version="$(< "$BUILD_ARTIFACTS/version")" + else + version="$(git describe --dirty=".dirty" --match "v*")" + # convert it to something that works for pip wheels + version="${version#v}"; version="${version/-g/+g}"; version="${version/-/.dev}" + fi + if [[ "$BUILDMODE" != "server" || "$AGENT_ID" = "" ]]; then + if [[ ! -r "$BUILD_ARTIFACTS/version" ]]; then + version="${version/+/+local.}" + fi + info="Local build: ${USERNAME:-$USER} ${BASEDIR:-$PWD}" + local line + info+="$( + git branch --no-color -vv --contains HEAD --merged | \ + while read line; do + if [[ "x$line" = "x*"* ]]; then + line="${line#"* "}"; + if [[ "$line" = *\[*\]* ]]; then line="${line%%\]*}]"; fi + if [[ "$line" = *\(*\)* ]]; then line="${line%%)*})"; fi + echo "//$line"; fi; done)" + if ! git diff-index --quiet HEAD; then info+=" (dirty)"; fi + else + local branch="${BUILD_SOURCEBRANCH#refs/heads/}" + # drop the commit sha1 for builds that are on the main line + if [[ "$BUILDPR:$branch" = ":master" && ! 
-r "$BUILD_ARTIFACTS/version" ]]; then + version="${version%+g[0-9a-f][0-9a-f]*}" + fi + info="$BUILD_REPOSITORY_NAME/$branch@${BUILD_SOURCEVERSION:0:8}" + info+="; $BUILD_DEFINITIONNAME#$BUILD_BUILDNUMBER" + fi + info="$version: $info" + cd "$owd" + fi + defvar -x MML_VERSION "$version" + defvar -x MML_BUILD_INFO "$info" +} + +# Parse $INSTALLATIONS info +declare -A _install_info +install_packages=() +_parse_install_info() { + local key="" libname="" x keys=1 + for x in "${INSTALLATIONS[@]}"; do + if [[ "$key" != "" ]]; then _install_info[${libname}.${key%:}]="$x" key="" + elif [[ "$x" = *: ]]; then key="$x" keys=1 + elif [[ "$x" != [A-Z]* ]]; then failwith "bad package name: $x" + elif ((!keys)); then failwith "install entry with no keys: $libname" + else libname="$x"; key=""; keys=0; install_packages+=("$x") + fi + done + # replace "<{ver}>"s + for x in "${!_install_info[@]}"; do _replace_ver_in_info "$x"; done +} +_replace_ver_in_info() { # lib.field + _install_info[$1]="${_install_info[$1]//"<{ver}>"/"${_install_info[${1%%.*}.ver]}"}" +} + +_post_config() { + _set_build_info + _parse_install_info + _parse_TESTS + _parse_PUBLISH + _replace_delayed_vars +} diff --git a/tools/tests/tags.sh b/tools/tests/tags.sh new file mode 100755 index 0000000000..d036958364 --- /dev/null +++ b/tools/tests/tags.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +map deftag a b c +num=0 fail=0 + +try() { + local test="$*" + local flags="${test%% => *}" expect="${test##* => }" + flags=$(echo $flags) + local res=$(TESTS="$flags"; unset _test_info; declare -A _test_info + _parse_TESTS + should test a && echo A + should test b && echo B + should test c && echo C + should test a b && echo AB) + res=$(echo { $res }) + ((num++)) + if [[ "$expect" != "$res" ]]; then + ((fail++)) + echo "FAIL: TEST=\"$flags\": expected $expect, got $res" + fi +} + +report() { + if ((fail == 0)); then echo "All tests passed"; exit 0 + else echo "$fail/$num tests failed"; exit 1; fi +} + +# The following is an exhaustive list of all a/b/c options, verified with +# scalatest. To try it: +# import org.scalatest.{FunSuite, Tag} +# object A extends Tag("a"); object B extends Tag("b"); object C extends Tag("c") +# class ExampleSpec extends FunSuite { +# test("A", A) {}; test("B", B) {}; test("C", C) {}; test("AB", A, B) {} +# } +# and then in sbt use -n for + and -l for -, eg: test-only * -- -n a -n b -l c + +try " => { A B C AB }" +try "+a +b +c => { A B C AB }" +try "+a +b => { A B AB }" +try "+a +b -c => { A B AB }" +try " -c => { A B AB }" +try "+a +c => { A C AB }" +try " +b +c => { B C AB }" +try "+a => { A AB }" +try "+a -c => { A AB }" +try " +b => { B AB }" +try " +b -c => { B AB }" +try "-a => { B C }" +try "-a +b +c => { B C }" +try " -b => { A C }" +try "+a -b +c => { A C }" +try "+a -b => { A }" +try " -b -c => { A }" +try "+a -b -c => { A }" +try "-a +b => { B }" +try "-a -c => { B }" +try "-a +b -c => { B }" +try "-a -b => { C }" +try " +c => { C }" +try "-a +c => { C }" +try " -b +c => { C }" +try "-a -b +c => { C }" +try "-a -b -c => { }" + +report