From 7abf8399d4da35c87b5f649e0956d753eeac835b Mon Sep 17 00:00:00 2001
From: Xinyue Ruan
Date: Thu, 8 Sep 2022 11:09:22 +0800
Subject: [PATCH] downgrade 3.1

---
 build.sbt                                      | 12 +++--
 .../cognitive/split1/TextAnalyticsSuite.scala  |  2 -
 .../ml/featurize/CleanMissingData.scala        |  2 +-
 .../source/image/PatchedImageFileFormat.scala  |  2 +-
 .../cyber/utils/test_spark_utils.py            |  2 +-
 .../azure/synapse/ml/codegen/RTestGen.scala    |  2 +-
 .../ml/nbtest/DatabricksUtilities.scala        | 12 +++--
 .../synapse/ml/nbtest/SynapseUtilities.scala   |  2 +-
 environment.yml                                |  2 +-
 .../AIsample - Time Series Forecasting.ipynb   | 54 +------------------
 pipeline.yaml                                  |  7 ++-
 start                                          |  2 +-
 tools/docker/demo/Dockerfile                   |  2 +-
 tools/docker/minimal/Dockerfile                |  2 +-
 tools/dotnet/dotnetSetup.sh                    |  8 +--
 tools/helm/livy/Dockerfile                     |  2 +-
 tools/helm/spark/Dockerfile                    |  2 +-
 tools/helm/zeppelin/Dockerfile                 |  2 +-
 tools/tests/run_r_tests.R                      |  2 +-
 19 files changed, 37 insertions(+), 84 deletions(-)

diff --git a/build.sbt b/build.sbt
index 8655ef401f..2a10cc1006 100644
--- a/build.sbt
+++ b/build.sbt
@@ -9,19 +9,21 @@
 import scala.xml.transform.{RewriteRule, RuleTransformer}
 import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}
 
 val condaEnvName = "synapseml"
-val sparkVersion = "3.2.2"
+val sparkVersion = "3.1.3"
 name := "synapseml"
 ThisBuild / organization := "com.microsoft.azure"
-ThisBuild / scalaVersion := "2.12.15"
+ThisBuild / scalaVersion := "2.12.10"
 val scalaMajorVersion = 2.12
 
 val excludes = Seq(
   ExclusionRule("org.apache.spark", s"spark-tags_$scalaMajorVersion"),
-  ExclusionRule("org.scalatest")
+  ExclusionRule("org.scalatest"),
+  ExclusionRule("org.json4s", s"json4s-ast_$scalaMajorVersion"),
 )
 
 val coreDependencies = Seq(
+  "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.12.5",
   "org.apache.spark" %% "spark-core" % sparkVersion % "compile",
   "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
   "org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
@@ -29,11 +31,11 @@ val coreDependencies = Seq(
   "org.scalatest" %% "scalatest" % "3.0.5" % "test")
 
 val extraDependencies = Seq(
   "org.scalactic" %% "scalactic" % "3.0.5",
-  "io.spray" %% "spray-json" % "1.3.5",
+  "io.spray" %% "spray-json" % "1.3.2",
   "com.jcraft" % "jsch" % "0.1.54",
   "org.apache.httpcomponents" % "httpclient" % "4.5.6",
   "org.apache.httpcomponents" % "httpmime" % "4.5.6",
-  "com.linkedin.isolation-forest" %% "isolation-forest_3.2.0" % "2.0.8"
+  "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1"
 ).map(d => d excludeAll (excludes: _*))
 val dependencies = coreDependencies ++ extraDependencies
diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/split1/TextAnalyticsSuite.scala
index 728e1b9f90..08bbe0a2ea 100644
--- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/split1/TextAnalyticsSuite.scala
+++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/split1/TextAnalyticsSuite.scala
@@ -12,8 +12,6 @@
 import org.apache.spark.ml.util.MLReadable
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.{DataFrame, Row}
 
-import java.util.concurrent.TimeoutException
-
 trait TextEndpoint {
   lazy val textKey: String = sys.env.getOrElse("TEXT_API_KEY", Secrets.CognitiveApiKey)
diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CleanMissingData.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CleanMissingData.scala
index c263394beb..cf3760beef 100644
--- a/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CleanMissingData.scala
+++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CleanMissingData.scala
@@ -127,7 +127,7 @@ class CleanMissingData(override val uid: String) extends Estimator[CleanMissingD
     // Verify columns are supported for imputation
     verifyColumnsSupported(dataset, colsToClean)
     val row =
-      dataset.select(columns.map(column => call_udf("percentile_approx",
+      dataset.select(columns.map(column => callUDF("percentile_approx",
         column, lit(0.5))): _*)
         .collect()(0)
     rowToValues(row)
diff --git a/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
index d160e88ccb..ca2c9756d6 100644
--- a/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
+++ b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
@@ -104,7 +104,7 @@ class PatchedImageFileFormat extends ImageFileFormat with Serializable with Logg
       val bytes = try {
         IOUtils.toByteArray(stream)
       } finally {
-        IOUtils.close(stream)
+        IOUtils.closeQuietly(stream)
       }
 
       val resultOpt = catchFlakiness(5)(ImageSchema.decode(origin, bytes))
diff --git a/core/src/test/python/synapsemltest/cyber/utils/test_spark_utils.py b/core/src/test/python/synapsemltest/cyber/utils/test_spark_utils.py
index 72e0d5721d..bfa0954504 100644
--- a/core/src/test/python/synapsemltest/cyber/utils/test_spark_utils.py
+++ b/core/src/test/python/synapsemltest/cyber/utils/test_spark_utils.py
@@ -57,10 +57,10 @@ def test_zip_with_index_sort_by_column_within_partitions(self):
             order_by_col="user",
         )
         expected = [
+            ("OrgB", "Joe", 0),
             ("OrgA", "Alice", 0),
             ("OrgA", "Bob", 1),
             ("OrgA", "Joe", 2),
-            ("OrgB", "Joe", 0),
         ]
         assert result.collect() == expected
 
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
index 2a8048fa59..687da36b33 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
@@ -105,7 +105,7 @@ object RTestGen {
       |    "spark.sql.shuffle.partitions=10",
       |    "spark.sql.crossJoin.enabled=true")
       |
-      |sc <- spark_connect(master = "local", version = "3.2.2", config = conf)
+      |sc <- spark_connect(master = "local", version = "3.1.3", config = conf)
       |
       |""".stripMargin, StandardOpenOption.CREATE)
 
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
index ebd39f1306..19e8034779 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
@@ -29,10 +29,10 @@ object DatabricksUtilities {
 
   // ADB Info
   val Region = "eastus"
-  val PoolName = "synapseml-build-10.4"
-  val GpuPoolName = "synapseml-build-10.4-gpu"
-  val AdbRuntime = "10.4.x-scala2.12"
-  val AdbGpuRuntime = "10.4.x-gpu-ml-scala2.12"
+  val PoolName = "synapseml-build-9.1"
+  val GpuPoolName = "synapseml-build-9.1-gpu"
+  val AdbRuntime = "9.1.x-scala2.12"
+  val AdbGpuRuntime = "9.1.x-gpu-ml-scala2.12"
   val NumWorkers = 5
   val AutoTerminationMinutes = 15
 
@@ -81,7 +81,9 @@ object DatabricksUtilities {
 
   val ParallelizableNotebooks: Seq[File] = NotebookFiles.filterNot(_.isDirectory)
 
-  val CPUNotebooks: Seq[File] = ParallelizableNotebooks.filterNot(_.getAbsolutePath.contains("simple_deep_learning"))
+  val CPUNotebooks: Seq[File] = ParallelizableNotebooks
+    .filterNot(_.getAbsolutePath.contains("simple_deep_learning"))
+    .filterNot(_.getAbsolutePath.contains("GeospatialServices")) // This service doesn't work on databricks for spark3.1
 
   val GPUNotebooks: Seq[File] = ParallelizableNotebooks.filter(_.getAbsolutePath.contains("simple_deep_learning"))
 
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
index d3eaa20a69..a490588f50 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
@@ -272,7 +272,7 @@ object SynapseUtilities {
       |    "nodeSizeFamily": "MemoryOptimized",
       |    "provisioningState": "Succeeded",
       |    "sessionLevelPackagesEnabled": "true",
-      |    "sparkVersion": "3.2"
+      |    "sparkVersion": "3.1"
       |  }
       |}
       |""".stripMargin
diff --git a/environment.yml b/environment.yml
index 94a4a5bcfc..dc9a37d8cb 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,7 +4,7 @@ channels:
   - default
 dependencies:
   - python=3.8.8
-  - pyspark=3.2.2
+  - pyspark=3.1.3
   - requests=2.26.0
   - pip=21.3
   - r-base=4.1.1
diff --git a/notebooks/community/aisamples/AIsample - Time Series Forecasting.ipynb b/notebooks/community/aisamples/AIsample - Time Series Forecasting.ipynb
index 2ca39dab04..752b940c74 100644
--- a/notebooks/community/aisamples/AIsample - Time Series Forecasting.ipynb
+++ b/notebooks/community/aisamples/AIsample - Time Series Forecasting.ipynb
@@ -398,58 +398,6 @@
    "source": [
     "display(df_p)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 4: Log and Load Model with MLFlow\n",
-    "We can now store the trained model for later use."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# setup mlflow\n",
-    "import mlflow\n",
-    "import trident.mlflow\n",
-    "from trident.mlflow import get_sds_url\n",
-    "\n",
-    "EXPERIMENT_NAME = \"aisample-timeseries\"\n",
-    "\n",
-    "mlflow.set_tracking_uri(get_sds_url())\n",
-    "mlflow.set_registry_uri(get_sds_url())\n",
-    "mlflow.set_experiment(EXPERIMENT_NAME)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# log the model and parameters\n",
-    "model_name = f\"{EXPERIMENT_NAME}-prophet\"\n",
-    "with mlflow.start_run() as run:\n",
-    "    mlflow.prophet.log_model(m, model_name, registered_model_name=model_name)\n",
-    "    mlflow.log_params({\"seasonality_mode\": \"multiplicative\", \"mcmc_samples\": 1000})\n",
-    "    model_uri = f\"runs:/{run.info.run_id}/{model_name}\"\n",
-    "    print(\"Model saved in run %s\" % run.info.run_id)\n",
-    "    print(f\"Model URI: {model_uri}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load the model back\n",
-    "loaded_model = mlflow.prophet.load_model(model_uri)"
-   ]
   }
  ],
 "metadata": {
@@ -471,4 +419,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
+}
\ No newline at end of file
diff --git a/pipeline.yaml b/pipeline.yaml
index 10b8ebe7cb..fa38c5922b 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -5,6 +5,7 @@ trigger:
   branches:
     include:
       - master
+      - spark3.1
   paths:
     exclude:
       - README.md
@@ -16,6 +17,7 @@ pr:
   branches:
     include:
       - master
+      - spark3.1
   paths:
     exclude:
       - README.md
@@ -99,8 +101,6 @@ jobs:
     matrix:
       databricks-cpu:
        TEST-CLASS: "com.microsoft.azure.synapse.ml.nbtest.DatabricksCPUTests"
-      databricks-gpu:
-       TEST-CLASS: "com.microsoft.azure.synapse.ml.nbtest.DatabricksGPUTests"
      synapse:
        TEST-CLASS: "com.microsoft.azure.synapse.ml.nbtest.SynapseTests"
   steps:
@@ -449,6 +449,9 @@ jobs:
       scriptLocation: inlineScript
       scriptType: bash
       inlineScript: |
+        echo "removing ivy2 cache"
+        sudo rm -rf ~/.ivy2/cache
+        sbt publishLocal
         export SBT_OPTS="-Xms2G -Xmx4G -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=4G -Xss5M -Duser.timezone=GMT"
         source activate synapseml
         (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
diff --git a/start b/start
index 709d1f93b8..c0964f8189 100644
--- a/start
+++ b/start
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 export OPENMPI_VERSION="3.1.2"
-export SPARK_VERSION="3.2.2"
+export SPARK_VERSION="3.1.3"
 export HADOOP_VERSION="2.7"
 export SYNAPSEML_VERSION="0.10.1" # Binder compatibility version
 
diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile
index 2a1f22eab8..020b26c163 100644
--- a/tools/docker/demo/Dockerfile
+++ b/tools/docker/demo/Dockerfile
@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=0.10.1
 ARG DEBIAN_FRONTEND=noninteractive
 
-ENV SPARK_VERSION=3.2.2
+ENV SPARK_VERSION=3.1.3
 ENV HADOOP_VERSION=2.7
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
diff --git a/tools/docker/minimal/Dockerfile b/tools/docker/minimal/Dockerfile
index 3fdada1684..f835edd91c 100644
--- a/tools/docker/minimal/Dockerfile
+++ b/tools/docker/minimal/Dockerfile
@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=0.10.1
 ARG DEBIAN_FRONTEND=noninteractive
 
-ENV SPARK_VERSION=3.2.2
+ENV SPARK_VERSION=3.1.3
 ENV HADOOP_VERSION=2.7
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
diff --git a/tools/dotnet/dotnetSetup.sh b/tools/dotnet/dotnetSetup.sh
index 1244caf479..bb1cddbce1 100644
--- a/tools/dotnet/dotnetSetup.sh
+++ b/tools/dotnet/dotnetSetup.sh
@@ -20,11 +20,11 @@ echo "##vso[task.setvariable variable=DOTNET_WORKER_DIR]$DOTNET_WORKER_DIR"
 # Install Sleet
 dotnet tool install -g sleet
 
-# Install Apache Spark-3.2
-curl https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -o spark-3.2.0-bin-hadoop3.2.tgz
+# Install Apache Spark-3.1
+curl https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz -o spark-3.1.2-bin-hadoop3.2.tgz
 mkdir ~/bin
-tar -xzvf spark-3.2.0-bin-hadoop3.2.tgz -C ~/bin
-export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/
+tar -xzvf spark-3.1.2-bin-hadoop3.2.tgz -C ~/bin
+export SPARK_HOME=~/bin/spark-3.1.2-bin-hadoop3.2/
 export PATH=$SPARK_HOME/bin:$PATH
 echo "##vso[task.setvariable variable=SPARK_HOME]$SPARK_HOME"
 echo "##vso[task.setvariable variable=PATH]$SPARK_HOME/bin:$PATH"
diff --git a/tools/helm/livy/Dockerfile b/tools/helm/livy/Dockerfile
index 97c2632e55..cb1301eea2 100644
--- a/tools/helm/livy/Dockerfile
+++ b/tools/helm/livy/Dockerfile
@@ -3,7 +3,7 @@ LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
 
 # Get Spark from US Apache mirror.
 ENV APACHE_SPARK_VERSION 2.4.5
-ENV HADOOP_VERSION 3.2.1
+ENV HADOOP_VERSION 3.1.3
 
 RUN echo "$LOG_TAG Getting SPARK_HOME" && \
     apt-get update && \
diff --git a/tools/helm/spark/Dockerfile b/tools/helm/spark/Dockerfile
index 5ca50eb518..d9507a488f 100644
--- a/tools/helm/spark/Dockerfile
+++ b/tools/helm/spark/Dockerfile
@@ -3,7 +3,7 @@ LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
 
 # Get Spark from US Apache mirror.
 ENV APACHE_SPARK_VERSION 2.4.5
-ENV HADOOP_VERSION 3.2.1
+ENV HADOOP_VERSION 3.1.3
 
 RUN echo "$LOG_TAG Getting SPARK_HOME" && \
     apt-get update && \
diff --git a/tools/helm/zeppelin/Dockerfile b/tools/helm/zeppelin/Dockerfile
index 6f92ed0203..c2e8e3ca5f 100644
--- a/tools/helm/zeppelin/Dockerfile
+++ b/tools/helm/zeppelin/Dockerfile
@@ -3,7 +3,7 @@ LABEL maintainer="Dalitso Banda dalitsohb@gmail.com"
 
 # Get Spark from US Apache mirror.
 ENV APACHE_SPARK_VERSION 2.4.5
-ENV HADOOP_VERSION 3.2.1
+ENV HADOOP_VERSION 3.1.3
 
 RUN echo "$LOG_TAG Getting SPARK_HOME" && \
     apt-get update && \
diff --git a/tools/tests/run_r_tests.R b/tools/tests/run_r_tests.R
index 954c7a58d7..74d606a0fc 100644
--- a/tools/tests/run_r_tests.R
+++ b/tools/tests/run_r_tests.R
@@ -3,6 +3,6 @@ if (!require("sparklyr")) {
   library("sparklyr")
 }
 
-spark_install(version = "3.2.2", hadoop_version = "3.2")
+spark_install(version = "3.1.3", hadoop_version = "3.2")
 options("testthat.output_file" = "../../../../r-test-results.xml")
 devtools::test(reporter = JunitReporter$new())