From 7c5e7b676974c21486704e71a3fa793d08f25d1c Mon Sep 17 00:00:00 2001 From: marhamil723 Date: Sun, 30 Jun 2019 13:54:12 -0400 Subject: [PATCH] Get e2e tests working --- build.sbt | 91 +++++++++++++------ .../AzureSearchIndex - Met Artworks.ipynb | 11 +-- .../Classification - Adult Census.ipynb | 9 +- ...fication - Before and After MMLSpark.ipynb | 13 +-- ...eServices - Celebrity Quote Analysis.ipynb | 5 +- ...g - BiLSTM Medical Entity Extraction.ipynb | 4 +- ...ning - CIFAR10 Convolutional Network.ipynb | 4 +- ...arning - Flower Image Classification.ipynb | 29 ++---- .../DeepLearning - Transfer Learning.ipynb | 10 +- ...rk - Working with Arbitrary Web APIs.ipynb | 2 +- ...meterTuning - Fighting Breast Cancer.ipynb | 13 +-- ...antile Regression for Drug Discovery.ipynb | 6 +- ...erpretation - Snow Leopard Detection.ipynb | 13 ++- ...nCV - Pipeline Image Transformations.ipynb | 9 +- ...n - Flight Delays with DataCleaning.ipynb | 8 +- .../samples/Regression - Auto Imports.ipynb | 20 ++-- .../samples/Regression - Flight Delays.ipynb | 6 +- ...parkServing - Deploying a Classifier.ipynb | 8 +- ... - Amazon Book Reviews with Word2Vec.ipynb | 7 +- .../TextAnalytics - Amazon Book Reviews.ipynb | 11 +-- pipeline.yaml | 19 +++- project/Secrets.scala | 8 +- project/plugins.sbt | 3 +- .../ml/nbtest/DatabricksUtilities.scala | 32 ++++--- .../microsoft/ml/nbtest/NotebookTests.scala | 2 + .../ml/spark/codegen/PySparkWrapper.scala | 1 + src/main/python/mmlspark/core/schema/Utils.py | 2 +- .../mmlspark/io/http/AzureSearchWriter.py | 4 +- .../mmlspark/lightgbm/LightGBMClassifier.py | 1 + .../mmlspark/lightgbm/LightGBMRegressor.py | 1 + .../python/mmlspark/stages/UDFTransformer.py | 2 +- .../com/microsoft/ml/spark/Secrets.scala | 1 + 32 files changed, 197 insertions(+), 158 deletions(-) diff --git a/build.sbt b/build.sbt index 2179591daa..27633ac4c9 100644 --- a/build.sbt +++ b/build.sbt @@ -5,23 +5,9 @@ import org.apache.commons.io.FileUtils import scala.sys.process.Process -def getVersion(baseVersion: String): String = { - sys.env.get("MMLSPARK_RELEASE").map(_ =>baseVersion) - .orElse(sys.env.get("BUILD_NUMBER").map(bn => baseVersion + s"_$bn")) - .getOrElse(baseVersion + "-SNAPSHOT") -} - -def getPythonVersion(baseVersion: String): String = { - sys.env.get("MMLSPARK_RELEASE").map(_ =>baseVersion) - .orElse(sys.env.get("BUILD_NUMBER").map(bn => baseVersion + s".dev$bn")) - .getOrElse(baseVersion + ".dev1") -} - -val baseVersion = "0.17.1" val condaEnvName = "mmlspark" name := "mmlspark" organization := "com.microsoft.ml.spark" -version := getVersion(baseVersion) scalaVersion := "2.11.12" val sparkVersion = "2.4.0" @@ -53,11 +39,11 @@ createCondaEnvTask := { val s = streams.value val hasEnv = Process("conda env list").lineStream.toList .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv){ + if (!hasEnv) { Process( "conda env create -f environment.yaml", new File(".")) ! s.log - } else{ + } else { println("Found conda env " + condaEnvName) } } @@ -70,10 +56,18 @@ cleanCondaEnvTask := { new File(".")) ! 
s.log } +def osPrefix: Seq[String] = { + if (sys.props("os.name").toLowerCase.contains("windows")) { + Seq("cmd", "/C") + } else { + Seq() + } +} + def activateCondaEnv: Seq[String] = { - if(sys.props("os.name").toLowerCase.contains("windows")){ - Seq("cmd", "/C", "activate", condaEnvName, "&&") - }else{ + if (sys.props("os.name").toLowerCase.contains("windows")) { + osPrefix ++ Seq("activate", condaEnvName, "&&") + } else { Seq() //TODO figure out why this doesent work //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") @@ -86,15 +80,27 @@ val pythonSrcDir = join(genDir.toString, "src", "python") val pythonPackageDir = join(genDir.toString, "package", "python") val pythonTestDir = join(genDir.toString, "test", "python") +def pythonizeVersion(v: String): String = { + if (v.contains("+")){ + v.split("+".head).head + ".dev1" + }else{ + v + } +} + packagePythonTask := { val s = streams.value (run in IntegrationTest2).toTask("").value createCondaEnvTask.value + val destPyDir = join("target", "scala-2.11", "classes", "mmlspark") + if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) + FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) + Process( activateCondaEnv ++ Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), pythonSrcDir, - "MML_PY_VERSION" -> getPythonVersion(baseVersion)) ! s.log + "MML_PY_VERSION" -> pythonizeVersion(version.value)) ! s.log } val installPipPackageTask = TaskKey[Unit]("installPipPackage", "install python sdk") @@ -105,7 +111,7 @@ installPipPackageTask := { packagePythonTask.value Process( activateCondaEnv ++ Seq("pip", "install", - s"mmlspark-${getPythonVersion(baseVersion)}-py2.py3-none-any.whl"), + s"mmlspark-${pythonizeVersion(version.value)}-py2.py3-none-any.whl"), pythonPackageDir) ! s.log } @@ -117,7 +123,7 @@ testPythonTask := { Process( activateCondaEnv ++ Seq("python", "tools2/run_all_tests.py"), new File("."), - "MML_VERSION" -> getVersion(baseVersion) + "MML_VERSION" -> version.value ) ! s.log } @@ -147,10 +153,36 @@ setupTask := { getDatasetsTask.value } +val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") +publishBlob := { + val s = streams.value + publishM2.value + val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") + val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" + + val localPackageFolder = join( + Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) + ++ organization.value.split(".".toCharArray.head) + ++ Seq(nameAndScalaVersion, version.value): _*).toString + + val blobMavenFolder = organization.value.replace(".", "/") + + s"/$nameAndScalaVersion/${version.value}" + val command = Seq("az", "storage", "blob", "upload-batch", + "--source", localPackageFolder, + "--destination", "maven", + "--destination-path", blobMavenFolder, + "--account-name", "mmlspark", + "--account-key", Secrets.storageKey) + println(command.mkString(" ")) + Process(osPrefix ++ command) ! 
s.log +} + val settings = Seq( (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion, baseDirectory, datasetDir), + buildInfoKeys := Seq[BuildInfoKey]( + name, version, scalaVersion, sbtVersion, + baseDirectory, datasetDir), parallelExecution in Test := false, buildInfoPackage := "com.microsoft.ml.spark.build") ++ inConfig(IntegrationTest2)(Defaults.testSettings) @@ -180,20 +212,25 @@ credentials += Credentials("Sonatype Nexus Repository Manager", pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) pgpSecretRing := { val temp = File.createTempFile("secret", ".asc") - new PrintWriter(temp) { write(Secrets.pgpPrivate); close() } + new PrintWriter(temp) { + write(Secrets.pgpPrivate); close() + } temp } pgpPublicRing := { val temp = File.createTempFile("public", ".asc") - new PrintWriter(temp) { write(Secrets.pgpPublic); close() } + new PrintWriter(temp) { + write(Secrets.pgpPublic); close() + } temp } licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) publishMavenStyle := true publishTo := Some( - if (isSnapshot.value) + if (isSnapshot.value) { Opts.resolver.sonatypeSnapshots - else + } else { Opts.resolver.sonatypeStaging + } ) diff --git a/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb b/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb index 3511db4981..16cf24619e 100644 --- a/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb +++ b/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb @@ -22,12 +22,7 @@ }, "outputs": [], "source": [ - "import numpy as np, pandas as pd, os, sys, time, json, requests\n", - "\n", - "from mmlspark import *\n", - "from pyspark.ml.classification import LogisticRegression\n", - "from pyspark.sql.functions import udf, col\n", - "from pyspark.sql.types import IntegerType, StringType, DoubleType, StructType, StructField, ArrayType\n", + "import os, sys, time, json, requests\n", "from pyspark.ml import Transformer, Estimator, Pipeline\n", "from pyspark.ml.feature import SQLTransformer\n", "from pyspark.sql.functions import lit, udf, col, split" @@ -80,6 +75,9 @@ }, "outputs": [], "source": [ + "from mmlspark.cognitive import DescribeImage\n", + "from mmlspark.stages import SelectColumns\n", + "\n", "#define pipeline\n", "describeImage = DescribeImage()\\\n", " .setSubscriptionKey(VISION_API_KEY)\\\n", @@ -191,6 +189,7 @@ }, "outputs": [], "source": [ + "from mmlspark.io.http import *\n", "data_processed.writeToAzureSearch(options)" ] }, diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/samples/Classification - Adult Census.ipynb index 9db9d8f2f4..6f0f6ce71d 100644 --- a/notebooks/samples/Classification - Adult Census.ipynb +++ b/notebooks/samples/Classification - Adult Census.ipynb @@ -18,10 +18,7 @@ "outputs": [], "source": [ "import numpy as np\n", - "import pandas as pd\n", - "import mmlspark\n", - "\n", - "# help(mmlspark)" + "import pandas as pd" ] }, { @@ -65,7 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TrainClassifier\n", + "from mmlspark.train import TrainClassifier\n", "from pyspark.ml.classification import LogisticRegression\n", "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)\n", "model.write().overwrite().save(\"adultCensusIncomeModel.mml\")" @@ -84,7 +81,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics, 
TrainedClassifierModel\n", + "from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel\n", "predictionModel = TrainedClassifierModel.load(\"adultCensusIncomeModel.mml\")\n", "prediction = predictionModel.transform(test)\n", "metrics = ComputeModelStatistics().transform(prediction)\n", diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/samples/Classification - Before and After MMLSpark.ipynb index 4e70b105f7..4bbf0c5663 100644 --- a/notebooks/samples/Classification - Before and After MMLSpark.ipynb +++ b/notebooks/samples/Classification - Before and After MMLSpark.ipynb @@ -42,13 +42,13 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import mmlspark\n", "from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n", + "import os, urllib\n", "\n", "dataFilePath = \"BookReviewsFromAmazon10K.tsv\"\n", "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", " StructField(\"text\", StringType(), False)])\n", - "import os, urllib\n", + "\n", "if not os.path.isfile(dataFilePath):\n", " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n", "rawData = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\"\\t\", header=None), textSchema)\n", @@ -92,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import UDFTransformer\n", + "from mmlspark.stages import UDFTransformer\n", "wordLength = \"wordLength\"\n", "wordCount = \"wordCount\"\n", "wordLengthTransformer = UDFTransformer(inputCol=\"text\", outputCol=wordLength, udf=wordLengthUDF)\n", @@ -208,7 +208,7 @@ "bestModel = models[metrics.index(bestMetric)]\n", "\n", "# Save model\n", - "bestModel.write().overwrite().save(\"SparkMLExperiment.mmls\")\n", + "bestModel.write().overwrite().save(\"SparkMLExperiment.mml\")\n", "# Get AUC on the validation dataset\n", "scoredVal = bestModel.transform(validation)\n", "print(evaluator.evaluate(scoredVal))" @@ -241,7 +241,8 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TrainClassifier, FindBestModel, ComputeModelStatistics\n", + "from mmlspark.train import TrainClassifier, ComputeModelStatistics\n", + "from mmlspark.automl import FindBestModel\n", "\n", "# Prepare data for learning\n", "train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)\n", @@ -257,7 +258,7 @@ "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", "\n", "# Save model\n", - "bestModel.write().overwrite().save(\"MMLSExperiment.mmls\")\n", + "bestModel.write().overwrite().save(\"MMLSExperiment.mml\")\n", "# Get AUC on the validation dataset\n", "predictions = bestModel.transform(validation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb index e541b2ba31..f318ce573d 100644 --- a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb +++ b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb @@ -22,7 +22,7 @@ }, "outputs": [], "source": [ - "from mmlspark import *\n", + "from mmlspark.cognitive import *\n", "from pyspark.ml import PipelineModel\n", "from pyspark.sql.functions import col, udf\n", "from pyspark.ml.feature import SQLTransformer\n", @@ -115,6 +115,8 @@ }, "outputs": [], "source": [ + "from mmlspark.stages import UDFTransformer \n", + "\n", "recognizeText = RecognizeText()\\\n", " 
.setSubscriptionKey(VISION_API_KEY)\\\n", " .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/recognizeText\")\\\n", @@ -175,6 +177,7 @@ "metadata": {}, "outputs": [], "source": [ + "from mmlspark.stages import SelectColumns\n", "# Select the final coulmns\n", "cleanupColumns = SelectColumns().setCols([\"url\", \"firstCeleb\", \"text\", \"sentimentScore\"])\n", "\n", diff --git a/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb index 243a582332..ddf27fdda2 100644 --- a/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb +++ b/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb @@ -31,14 +31,14 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import CNTKModel, ModelDownloader\n", + "from mmlspark.cntk import CNTKModel\n", + "from mmlspark.downloader import ModelDownloader\n", "from pyspark.sql.functions import udf, col\n", "from pyspark.sql.types import IntegerType, ArrayType, FloatType, StringType\n", "from pyspark.sql import Row\n", "\n", "from os.path import abspath, join\n", "import numpy as np\n", - "import pickle\n", "from nltk.tokenize import sent_tokenize, word_tokenize\n", "import os, tarfile, pickle\n", "import urllib.request\n", diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb index 653efdfbc8..41399e6fbb 100644 --- a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb +++ b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb @@ -13,7 +13,8 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import CNTKModel, ModelDownloader\n", + "from mmlspark.cntk import CNTKModel\n", + "from mmlspark.downloader import ModelDownloader\n", "from pyspark.sql.functions import udf\n", "from pyspark.sql.types import IntegerType\n", "from os.path import abspath" @@ -104,7 +105,6 @@ "metadata": {}, "outputs": [], "source": [ - "import array\n", "from pyspark.sql.functions import col\n", "from pyspark.sql.types import *\n", "\n", diff --git a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb b/notebooks/samples/DeepLearning - Flower Image Classification.ipynb index d29e15fec8..72fc61ac9f 100644 --- a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb +++ b/notebooks/samples/DeepLearning - Flower Image Classification.ipynb @@ -6,26 +6,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pyspark\n", - "from pyspark.sql.functions import udf, col\n", - "from pyspark.sql.types import IntegerType, StringType, DoubleType\n", "from pyspark.ml import Transformer, Estimator, Pipeline\n", "from pyspark.ml.classification import LogisticRegression\n", - "from mmlspark import *\n", - "\n", - "import numpy as np, pandas as pd, os, sys, time\n", - "from os.path import join, abspath, exists" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "mml-deploy": "hdinsight" - }, - "outputs": [], - "source": [ - "model = ModelDownloader(spark, \"/models\").downloadByName(\"ResNet50\")" + "from mmlspark.downloader import ModelDownloader\n", + "import os, sys, time" ] }, { @@ -36,7 +20,7 @@ }, "outputs": [], "source": [ - "model = ModelDownloader(spark, \"models\").downloadByName(\"ResNet50\")" + "model = ModelDownloader(spark, \"dbfs:/models/\").downloadByName(\"ResNet50\")" ] }, { @@ -66,6 +50,10 @@ "metadata": {}, "outputs": [], "source": [ + "from 
mmlspark.opencv import ImageTransformer\n", + "from mmlspark.image import UnrollImage, ImageFeaturizer\n", + "from mmlspark.stages import *\n", + "\n", "# Make some featurizers\n", "it = ImageTransformer()\\\n", " .setOutputCol(\"scaled\")\\\n", @@ -183,9 +171,8 @@ "source": [ "import matplotlib.pyplot as plt\n", "from sklearn.metrics import confusion_matrix\n", - "import pandas as pd\n", - "from glob import glob\n", "import numpy as np\n", + "\n", "def evaluate(results, name):\n", " y, y_hat = results[\"labels\"],results[\"prediction\"]\n", " y = [int(l) for l in y]\n", diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/samples/DeepLearning - Transfer Learning.ipynb index 58fbc6610c..3774cccce4 100644 --- a/notebooks/samples/DeepLearning - Transfer Learning.ipynb +++ b/notebooks/samples/DeepLearning - Transfer Learning.ipynb @@ -23,9 +23,9 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import CNTKModel, ModelDownloader\n", - "import numpy as np, pandas as pd\n", - "import os, urllib, tarfile, pickle, array\n", + "from mmlspark.cntk import CNTKModel\n", + "from mmlspark.downloader import ModelDownloader\n", + "import numpy as np, os, urllib, tarfile, pickle, array\n", "from os.path import abspath\n", "from pyspark.sql.functions import col, udf\n", "from pyspark.sql.types import *\n", @@ -143,7 +143,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TrainClassifier\n", + "from mmlspark.train import TrainClassifier\n", "from pyspark.ml.classification import RandomForestClassifier\n", "\n", "train,test = featurizedImages.randomSplit([0.75,0.25])\n", @@ -164,7 +164,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "predictions = model.transform(test)\n", "metrics = ComputeModelStatistics(evaluationMetric=\"accuracy\").transform(predictions)\n", "metrics.show()" diff --git a/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb b/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb index 30e65d6fd5..c37d49517c 100644 --- a/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb +++ b/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb @@ -30,7 +30,7 @@ "source": [ "from pyspark.sql.functions import struct\n", "from pyspark.sql.types import *\n", - "from mmlspark import *\n", + "from mmlspark.io.http import *\n", "\n", "df = spark.createDataFrame([(\"foo\",) for x in range(20)], [\"data\"]) \\\n", " .withColumn(\"inputs\", struct(\"data\"))\n", diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb index fa1e7d4ba2..80118f2e58 100644 --- a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb +++ b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb @@ -18,7 +18,6 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import mmlspark\n", "from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField" ] }, @@ -67,8 +66,8 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TuneHyperparameters\n", - "from mmlspark.TrainClassifier import TrainClassifier\n", + "from mmlspark.automl import TuneHyperparameters\n", + "from mmlspark.train import TrainClassifier\n", "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", "logReg = 
LogisticRegression()\n", "randForest = RandomForestClassifier()\n", @@ -92,10 +91,8 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import HyperparamBuilder\n", - "from mmlspark import RangeHyperParam\n", - "from mmlspark import DiscreteHyperParam\n", - "from mmlspark import RandomSpace\n", + "from mmlspark.automl import *\n", + "\n", "paramBuilder = \\\n", " HyperparamBuilder() \\\n", " .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) \\\n", @@ -158,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "prediction = bestModel.transform(test)\n", "metrics = ComputeModelStatistics().transform(prediction)\n", "metrics.limit(10).toPandas()" diff --git a/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb b/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb index ad85d980d0..83447c285e 100644 --- a/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb +++ b/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb @@ -73,7 +73,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import LightGBMRegressor\n", + "from mmlspark.lightgbm import LightGBMRegressor\n", "model = LightGBMRegressor(objective='quantile',\n", " alpha=0.2,\n", " learningRate=0.3,\n", @@ -93,7 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import LightGBMRegressionModel\n", + "from mmlspark.lightgbm import LightGBMRegressionModel\n", "model.saveNativeModel(\"mymodel\")\n", "model = LightGBMRegressionModel.loadNativeModelFromFile(\"mymodel\")" ] @@ -144,7 +144,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", " labelCol='label',\n", " scoresCol='prediction') \\\n", diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb index 58037555a8..4c3ae24c93 100644 --- a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb +++ b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb @@ -35,8 +35,8 @@ }, "outputs": [], "source": [ - "from mmlspark import *\n", - "from mmlspark import FluentAPI\n", + "from mmlspark.cognitive import *\n", + "from mmlspark.core.spark import FluentAPI\n", "import os\n", "from pyspark.sql.functions import lit\n", "\n", @@ -197,6 +197,10 @@ "from pyspark.ml.feature import StringIndexer\n", "from pyspark.ml.classification import LogisticRegression\n", "from pyspark.sql.functions import udf\n", + "from mmlspark.downloader import ModelDownloader\n", + "from mmlspark.image import ImageFeaturizer \n", + "from mmlspark.stages import UDFTransformer\n", + "from pyspark.sql.types import *\n", "\n", "def getIndex(row):\n", " return float(row[1])\n", @@ -253,6 +257,8 @@ "outputs": [], "source": [ "import urllib.request\n", + "from mmlspark.lime import ImageLIME\n", + "\n", "test_image_url = \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n", "with urllib.request.urlopen(test_image_url) as url:\n", " barr = url.read()\n", @@ -279,8 +285,7 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import PIL\n", - "import io\n", + "import PIL, io, numpy as np\n", "\n", "def plot_superpixels(row):\n", " image_bytes = 
row['image']\n", diff --git a/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb b/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb index ac7356443a..be4dd82b69 100644 --- a/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb +++ b/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb @@ -28,7 +28,8 @@ "source": [ "import mmlspark\n", "import numpy as np\n", - "from mmlspark import toNDArray\n", + "from mmlspark.opencv import toNDArray\n", + "from mmlspark.io import *\n", "\n", "imageDir = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/sampleImages\"\n", "images = spark.read.image().load(imageDir).cache()\n", @@ -138,7 +139,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ImageTransformer\n", + "from mmlspark.opencv import ImageTransformer\n", "\n", "tr = (ImageTransformer() # images are resized and then cropped\n", " .setOutputCol(\"transformed\")\n", @@ -167,7 +168,7 @@ "outputs": [], "source": [ "from pyspark.sql.functions import udf\n", - "from mmlspark import ImageSchema, toNDArray, toImage\n", + "from mmlspark.opencv import ImageSchema, toNDArray, toImage\n", "\n", "def u(row):\n", " array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n", @@ -195,7 +196,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import UnrollImage\n", + "from mmlspark.image import UnrollImage\n", "\n", "unroller = UnrollImage().setInputCol(\"noblue\").setOutputCol(\"unrolled\")\n", "\n", diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb index 1c79118aec..21eb2ab677 100644 --- a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb +++ b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb @@ -96,7 +96,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import DataConversion\n", + "from mmlspark.featurize import DataConversion\n", "flightDelay = DataConversion(cols=[\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\n", " \"OriginAirportID\",\"DestAirportID\",\n", " \"CRSDepTime\",\"CRSArrTime\"],\n", @@ -148,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TrainRegressor, TrainedRegressorModel\n", + "from mmlspark.train import TrainRegressor, TrainedRegressorModel\n", "from pyspark.ml.regression import LinearRegression\n", "\n", "trainCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n", @@ -194,7 +194,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "metrics = ComputeModelStatistics().transform(scoredData)\n", "metrics.toPandas()" ] @@ -213,7 +213,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputePerInstanceStatistics\n", + "from mmlspark.train import ComputePerInstanceStatistics\n", "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", "evalPerInstance.select(\"ArrDelay\", \"Scores\", \"L1_loss\", \"L2_loss\") \\\n", " .limit(10).toPandas()" diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/samples/Regression - Auto Imports.ipynb index f0dcfeb30d..c808aa04ac 100644 --- a/notebooks/samples/Regression - Auto Imports.ipynb +++ b/notebooks/samples/Regression - Auto Imports.ipynb @@ -29,15 +29,6 @@ "using `pandas.read_csv()`" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "import pandas as pd" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -54,6 +45,7 @@ "metadata": {}, "outputs": [], "source": [ + "import pandas as pd\n", "from pyspark.sql.types import LongType, StringType, DoubleType, StructType, StructField\n", "\n", "colSchema = (\n", @@ -138,7 +130,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import SummarizeData\n", + "from mmlspark.stages import SummarizeData\n", "summary = SummarizeData().transform(data)\n", "summary.toPandas()" ] @@ -183,7 +175,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import CleanMissingData\n", + "from mmlspark.featurize import CleanMissingData\n", "cols = [\"normalized-losses\", \"stroke\", \"bore\", \"horsepower\",\n", " \"peak-rpm\", \"price\"]\n", "cleanModel = CleanMissingData().setCleaningMode(\"Median\") \\\n", @@ -236,7 +228,7 @@ "# train Poisson Regression Model\n", "from pyspark.ml.regression import GeneralizedLinearRegression\n", "from pyspark.ml import Pipeline\n", - "from mmlspark import TrainRegressor\n", + "from mmlspark.train import TrainRegressor\n", "\n", "glr = GeneralizedLinearRegression(family=\"poisson\", link=\"log\")\n", "poissonModel = TrainRegressor().setModel(glr).setLabelCol(\"price\").setNumFeatures(256)\n", @@ -289,7 +281,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "poissonMetrics = ComputeModelStatistics().transform(poissonPrediction)\n", "print(\"Poisson Metrics\")\n", "poissonMetrics.toPandas()" @@ -319,7 +311,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputePerInstanceStatistics\n", + "from mmlspark.train import ComputePerInstanceStatistics\n", "def demonstrateEvalPerInstance(pred):\n", " return ComputePerInstanceStatistics().transform(pred) \\\n", " .select(\"price\", \"Scores\", \"L1_loss\", \"L2_loss\") \\\n", diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/samples/Regression - Flight Delays.ipynb index c54cebdb58..b6f7a6ace0 100644 --- a/notebooks/samples/Regression - Flight Delays.ipynb +++ b/notebooks/samples/Regression - Flight Delays.ipynb @@ -85,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import TrainRegressor, TrainedRegressorModel\n", + "from mmlspark.train import TrainRegressor, TrainedRegressorModel\n", "from pyspark.ml.regression import LinearRegression\n", "from pyspark.ml.feature import StringIndexer\n", "# Convert columns to categorical\n", @@ -132,7 +132,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "metrics = ComputeModelStatistics().transform(scoredData)\n", "metrics.toPandas()" ] @@ -151,7 +151,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputePerInstanceStatistics\n", + "from mmlspark.train import ComputePerInstanceStatistics\n", "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", "evalPerInstance.select(\"ArrDelay\", \"Scores\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()" ] diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb index 431213d5bc..7abf9de330 100644 --- a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb +++ b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb @@ -19,8 +19,7 @@ "source": [ "import sys\n", "import numpy as np\n", - 
"import pandas as pd\n", - "import mmlspark\n" + "import pandas as pd\n" ] }, { @@ -66,7 +65,7 @@ }, "outputs": [], "source": [ - "from mmlspark import TrainClassifier\n", + "from mmlspark.train import TrainClassifier\n", "from pyspark.ml.classification import LogisticRegression\n", "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)" ] @@ -84,7 +83,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import ComputeModelStatistics, TrainedClassifierModel\n", + "from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel\n", "prediction = model.transform(test)\n", "prediction.printSchema()" ] @@ -114,6 +113,7 @@ "outputs": [], "source": [ "from pyspark.sql.types import *\n", + "from mmlspark.io.http import *\n", "import uuid\n", "\n", "serving_inputs = spark.readStream.server() \\\n", diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb index f75f1aec4c..bb5a95bec3 100644 --- a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb +++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb @@ -18,7 +18,6 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import mmlspark\n", "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" ] }, @@ -127,7 +126,7 @@ "outputs": [], "source": [ "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", - "from mmlspark.TrainClassifier import TrainClassifier\n", + "from mmlspark.train import TrainClassifier\n", "import itertools\n", "\n", "lrHyperParams = [0.05, 0.2]\n", @@ -164,7 +163,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import FindBestModel\n", + "from mmlspark.automl import FindBestModel\n", "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=trainedModels).fit(ptest)\n", "bestModel.getEvaluationResults().show()\n", "bestModel.getBestModelMetrics().show()\n", @@ -184,7 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "predictions = bestModel.transform(pvalidation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", "print(\"Best model's accuracy on validation set = \"\n", diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb index a7f70a96d4..db3e445ba4 100644 --- a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb +++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb @@ -18,7 +18,6 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import mmlspark\n", "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" ] }, @@ -43,7 +42,7 @@ "metadata": {}, "source": [ "Use `TextFeaturizer` to generate our features column. We remove stop words, and use TF-IDF\n", - "to generate 2\u00b2\u2070 sparse features." + "to generate 2²⁰ sparse features." 
] }, { @@ -52,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark.TextFeaturizer import TextFeaturizer\n", + "from mmlspark.featurize.text import TextFeaturizer\n", "textFeaturizer = TextFeaturizer() \\\n", " .setInputCol(\"text\").setOutputCol(\"features\") \\\n", " .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(data)" @@ -106,7 +105,7 @@ "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", "logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]\n", "\n", - "from mmlspark.TrainClassifier import TrainClassifier\n", + "from mmlspark.train import TrainClassifier\n", "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(train) for lrm in logisticRegressions]" ] }, @@ -123,7 +122,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark import FindBestModel, BestModel\n", + "from mmlspark.automl import FindBestModel, BestModel\n", "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", "bestModel.getEvaluationResults().show()\n", "bestModel.getBestModelMetrics().show()\n", @@ -145,7 +144,7 @@ "metadata": {}, "outputs": [], "source": [ - "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "from mmlspark.train import ComputeModelStatistics\n", "predictions = loadedBestModel.transform(validation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", "print(\"Best model's accuracy on validation set = \"\n", diff --git a/pipeline.yaml b/pipeline.yaml index 7e72edc13d..b2193226ca 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ jobs: scriptLocation: inlineScript inlineScript: 'sbt scalastyle test:scalastyle' -- job: Publish +- job: PublishAndE2E pool: vmImage: ubuntu-16.04 steps: @@ -24,12 +24,25 @@ jobs: inputs: azureSubscription: 'Findable Incubation(ca9d21ff-2a46-4e8b-bf06-8d65242342e5)' scriptLocation: inlineScript - inlineScript: 'sbt publishSigned' + inlineScript: 'sbt packagePython && sbt publishBlob' env: BUILD_NUMBER: $(Build.BuildId) + - task: AzureCLI@1 + displayName: 'E2E' + inputs: + azureSubscription: 'Findable Incubation(ca9d21ff-2a46-4e8b-bf06-8d65242342e5)' + scriptLocation: inlineScript + inlineScript: 'sbt it:test' + env: + BUILD_NUMBER: $(Build.BuildId) + - task: PublishTestResults@2 + displayName: 'Publish Test Results **/test-reports/*.xml' + inputs: + testResultsFiles: '**/test-reports/*.xml' + failTaskOnFailedTests: true + condition: succeededOrFailed() - job: PythonTests - timeoutInMinutes: 0 pool: vmImage: ubuntu-16.04 steps: diff --git a/project/Secrets.scala b/project/Secrets.scala index 1b2d776ac8..d44abf982e 100644 --- a/project/Secrets.scala +++ b/project/Secrets.scala @@ -19,14 +19,13 @@ object Secrets { } } - private def getSecret(secretName: String): String = { println(s"fetching secret: $secretName") - try{ + try { exec(s"az account set -s $subscriptionID") val secretJson = exec(s"az keyvault secret show --vault-name $kvName --name $secretName") secretJson.parseJson.asJsObject().fields("value").convertTo[String] - }catch { + } catch { case _: IOException => println("WARNING: Could not load secret from keyvault, defaulting to the empty string." 
+ " Please install az command line to perform authorized build steps like publishing") @@ -45,5 +44,6 @@ object Secrets { lazy val pgpPrivate: String = new String(Base64.getDecoder.decode( getSecret("pgp-private").getBytes("UTF-8"))) lazy val pgpPassword: String = getSecret("pgp-pw") + lazy val storageKey: String = getSecret("storage-key") -} \ No newline at end of file +} diff --git a/project/plugins.sbt b/project/plugins.sbt index c7e0d8af1a..d0b34041a1 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -2,4 +2,5 @@ addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.2") addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") -addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") \ No newline at end of file +addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") +addSbtPlugin("com.dwijnand" % "sbt-dynver" % "3.1.0") diff --git a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala index 7d7af503fa..5798d63ef8 100644 --- a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala +++ b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala @@ -4,10 +4,15 @@ package com.microsoft.ml.nbtest import java.io.{File, FileInputStream} +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter import java.util.concurrent.TimeoutException import com.microsoft.ml.spark.core.env.StreamUtilities._ import com.microsoft.ml.nbtest.SprayImplicits._ +import com.microsoft.ml.spark.Secrets +import com.microsoft.ml.spark.build.BuildInfo +import com.microsoft.ml.spark.core.env.FileUtilities import org.apache.commons.io.IOUtils import org.apache.http.client.config.RequestConfig import org.apache.http.client.methods.{HttpGet, HttpPost} @@ -22,10 +27,11 @@ import scala.sys.process.Process import com.microsoft.ml.spark.core.env.StreamUtilities._ import scala.concurrent.blocking +//noinspection ScalaStyle object DatabricksUtilities { lazy val requestTimeout = 60000 - lazy val requestConfig = RequestConfig.custom() + lazy val requestConfig: RequestConfig = RequestConfig.custom() .setConnectTimeout(requestTimeout) .setConnectionRequestTimeout(requestTimeout) .setSocketTimeout(requestTimeout) @@ -36,27 +42,23 @@ object DatabricksUtilities { // ADB Info val region = "southcentralus" - val token = sys.env("MML_ADB_TOKEN") + val token: String = sys.env.getOrElse("MML_ADB_TOKEN", Secrets.adbToken) val authValue: String = "Basic " + BaseEncoding.base64() .encode(("token:" + token).getBytes("UTF-8")) val baseURL = s"https://$region.azuredatabricks.net/api/2.0/" val clusterName = "Test Cluster" lazy val clusterId: String = getClusterIdByName(clusterName) - val folder = "/MMLSparkBuild/Build1" + + val folder = s"/MMLSparkBuild/build_${BuildInfo.version}" // MMLSpark info - val topDir = new File(new File(getClass.getResource("/").toURI), "") - val showVersionScript = new File(topDir, "tools/runme/show-version") - val mmlVersion = sys.env.getOrElse("MML_VERSION", Process(showVersionScript.toString).!!.trim) - //val mmlVersion = "0.12.dev13" // Uncomment this line to test against a custom version - assert(mmlVersion != "0.0", s"This version $mmlVersion is only for local usage") - val scalaVersion = sys.env("SCALA_VERSION") - val version = s"com.microsoft.ml.spark:mmlspark_$scalaVersion:$mmlVersion" + val truncatedScalaVersion: String = BuildInfo.scalaVersion + 
.split(".".toCharArray.head).dropRight(1).mkString(".") + val version = s"com.microsoft.ml.spark:${BuildInfo.name}_$truncatedScalaVersion:${BuildInfo.version}" + val repository = "https://mmlspark.azureedge.net/maven" val libraries: String = List( - Map("maven" -> Map( - "coordinates" -> version, - "repo" -> "https://mmlspark.azureedge.net/maven")), + Map("maven" -> Map("coordinates" -> version, "repo" -> repository)), Map("pypi" -> Map("package" -> "nltk")) ).toJson.compactPrint @@ -64,7 +66,7 @@ object DatabricksUtilities { val timeoutInMillis: Int = 25 * 60 * 1000 val notebookFiles: Array[File] = Option( - new File(topDir, "BuildArtifacts/notebooks/hdinsight").getCanonicalFile.listFiles() + FileUtilities.join(BuildInfo.baseDirectory, "notebooks", "samples").getCanonicalFile.listFiles() ).get def retry[T](backoffs: List[Int], f: () => T): T = { @@ -211,7 +213,7 @@ object DatabricksUtilities { def monitorJob(runId: Integer, timeout: Int, - interval: Int = 2000, + interval: Int = 8000, logLevel: Int = 1): Future[Unit] = { Future { var finalState: Option[String] = None diff --git a/src/it/scala/com/microsoft/ml/nbtest/NotebookTests.scala b/src/it/scala/com/microsoft/ml/nbtest/NotebookTests.scala index 08507154ef..ab3a1f2e46 100644 --- a/src/it/scala/com/microsoft/ml/nbtest/NotebookTests.scala +++ b/src/it/scala/com/microsoft/ml/nbtest/NotebookTests.scala @@ -33,6 +33,8 @@ class NotebookTests extends TestBase { assert(listInstalledLibraries(clusterId).isEmpty, "Cluster already has libraries installed") println("Installing libraries") installLibraries(clusterId) + println(s"Creating folder $folder") + workspaceMkDir(folder) println(s"Submitting jobs") val jobIds = notebookFiles.map(uploadAndSubmitNotebook) println(s"Submitted ${jobIds.length} for execution: ${jobIds.toList}") diff --git a/src/it/scala/com/microsoft/ml/spark/codegen/PySparkWrapper.scala b/src/it/scala/com/microsoft/ml/spark/codegen/PySparkWrapper.scala index d348e473bd..4dbaeb248e 100644 --- a/src/it/scala/com/microsoft/ml/spark/codegen/PySparkWrapper.scala +++ b/src/it/scala/com/microsoft/ml/spark/codegen/PySparkWrapper.scala @@ -48,6 +48,7 @@ abstract class PySparkParamsWrapper(entryPoint: Params, |from pyspark.ml.param.shared import * |from pyspark import keyword_only |from pyspark.ml.util import JavaMLReadable, JavaMLWritable + |from mmlspark.core.serialize.java_params_patch import * |$importClassString |from pyspark.ml.common import inherit_doc |$importsString diff --git a/src/main/python/mmlspark/core/schema/Utils.py b/src/main/python/mmlspark/core/schema/Utils.py index ef615e892b..f30d75636e 100644 --- a/src/main/python/mmlspark/core/schema/Utils.py +++ b/src/main/python/mmlspark/core/schema/Utils.py @@ -77,7 +77,7 @@ def _transfer_params_from_java(self): if self._java_obj.hasParam(param.name): java_param = self._java_obj.getParam(param.name) # SPARK-14931: Only check set com.microsoft.ml.spark.core.serialize.params back to avoid default com.microsoft.ml.spark.core.serialize.params mismatch. 
- complex_param_class = sc._gateway.jvm.org.apache.spark.ml.param.ComplexParam._java_lang_class + complex_param_class = sc._gateway.jvm.com.microsoft.ml.spark.core.serialize.ComplexParam._java_lang_class is_complex_param = complex_param_class.isAssignableFrom(java_param.getClass()) if self._java_obj.isSet(java_param): if is_complex_param: diff --git a/src/main/python/mmlspark/io/http/AzureSearchWriter.py b/src/main/python/mmlspark/io/http/AzureSearchWriter.py index 138e0de381..b305f55bb1 100644 --- a/src/main/python/mmlspark/io/http/AzureSearchWriter.py +++ b/src/main/python/mmlspark/io/http/AzureSearchWriter.py @@ -15,14 +15,14 @@ def streamToAzureSearch(df, options=dict()): jvm = SparkContext.getOrCreate()._jvm - writer = jvm.com.microsoft.ml.spark.io.http.AzureSearchWriter + writer = jvm.com.microsoft.ml.spark.cognitive.AzureSearchWriter return writer.stream(df._jdf, options) setattr(pyspark.sql.DataFrame, 'streamToAzureSearch', streamToAzureSearch) def writeToAzureSearch(df, options=dict()): jvm = SparkContext.getOrCreate()._jvm - writer = jvm.com.microsoft.ml.spark.io.http.AzureSearchWriter + writer = jvm.com.microsoft.ml.spark.cognitive.AzureSearchWriter writer.write(df._jdf, options) setattr(pyspark.sql.DataFrame, 'writeToAzureSearch', writeToAzureSearch) diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassifier.py b/src/main/python/mmlspark/lightgbm/LightGBMClassifier.py index ae32856857..cc25eeda4b 100644 --- a/src/main/python/mmlspark/lightgbm/LightGBMClassifier.py +++ b/src/main/python/mmlspark/lightgbm/LightGBMClassifier.py @@ -13,6 +13,7 @@ from pyspark import SparkContext from pyspark.ml.common import inherit_doc from pyspark.ml.wrapper import JavaParams +from mmlspark.core.serialize.java_params_patch import * @inherit_doc class LightGBMClassifier(_LightGBMClassifier): diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressor.py b/src/main/python/mmlspark/lightgbm/LightGBMRegressor.py index 32ed327437..8f47c9210d 100644 --- a/src/main/python/mmlspark/lightgbm/LightGBMRegressor.py +++ b/src/main/python/mmlspark/lightgbm/LightGBMRegressor.py @@ -13,6 +13,7 @@ from pyspark import SparkContext from pyspark.ml.common import inherit_doc from pyspark.ml.wrapper import JavaParams +from mmlspark.core.serialize.java_params_patch import * @inherit_doc class LightGBMRegressor(_LightGBMRegressor): diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/src/main/python/mmlspark/stages/UDFTransformer.py index 2231588de9..beb9b20651 100644 --- a/src/main/python/mmlspark/stages/UDFTransformer.py +++ b/src/main/python/mmlspark/stages/UDFTransformer.py @@ -31,7 +31,7 @@ class UDFTransformer(ComplexParamsMixin, JavaMLReadable, JavaMLWritable, JavaTra @keyword_only def __init__(self, inputCol=None, inputCols=None, outputCol=None, udf=None): super(UDFTransformer, self).__init__() - self._java_obj = self._new_java_obj("com.microsoft.ml.spark.UDFTransformer") + self._java_obj = self._new_java_obj("com.microsoft.ml.spark.stages.UDFTransformer") self.inputCol = Param(self, "inputCol", "inputCol: The name of the input column (default: )") self.inputCols = Param(self, "inputCols", "inputCols: The names of the input columns (default: )") self.outputCol = Param(self, "outputCol", "outputCol: The name of the output column") diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/src/test/scala/com/microsoft/ml/spark/Secrets.scala index 4e98e7f1ed..dc809d179a 100644 --- a/src/test/scala/com/microsoft/ml/spark/Secrets.scala +++ 
b/src/test/scala/com/microsoft/ml/spark/Secrets.scala @@ -39,5 +39,6 @@ object Secrets { lazy val powerbiURL: String = getSecret("powerbi-url") lazy val speechApiKey: String = getSecret("speech-api-key") lazy val visionApiKey: String = getSecret("vision-api-key") + lazy val adbToken: String = getSecret("adb-token") }
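
For reference, the version plumbing in this patch works end to end roughly as follows: with getVersion/getPythonVersion deleted from build.sbt and sbt-dynver added in plugins.sbt, version.value is derived from git tags; packagePython rewrites it into a wheel-friendly form via pythonizeVersion, and the Databricks e2e tests rebuild the Maven coordinate from BuildInfo before installing the library from the blob-hosted repository that publishBlob uploads to. The minimal sketch below mirrors that logic; the example version strings are assumed dynver-style values for illustration, not output from an actual build.

object VersionSketch {
  // Same logic as pythonizeVersion in build.sbt: strip everything after the first '+'
  // of a dynver-style version and mark it as a dev build for the wheel file name.
  def pythonizeVersion(v: String): String =
    if (v.contains("+")) v.split('+').head + ".dev1" else v

  // Same logic as truncatedScalaVersion in DatabricksUtilities: "2.11.12" -> "2.11",
  // the binary-compatibility suffix used in the published artifact name.
  def truncatedScalaVersion(sv: String): String =
    sv.split('.').dropRight(1).mkString(".")

  def main(args: Array[String]): Unit = {
    println(pythonizeVersion("0.17.1+5-abc1234"))  // 0.17.1.dev1 (assumed untagged dynver build)
    println(pythonizeVersion("0.17.1"))            // 0.17.1      (tagged release, unchanged)
    println(truncatedScalaVersion("2.11.12"))      // 2.11
    // The e2e tests then install a coordinate of the shape
    // com.microsoft.ml.spark:mmlspark_2.11:<version> from https://mmlspark.azureedge.net/maven.
  }
}

In the pipeline.yaml change above this corresponds to running `sbt packagePython && sbt publishBlob` before `sbt it:test`, so the notebooks executed on the test cluster resolve the library built from the same commit.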