Skip to content

Commit

Permalink
fix: Make SynapseE2E Tests work now with Spark 3.2 (#1362)
Browse files Browse the repository at this point in the history
* Trying to use only pool with Spark 3.2

* Updating install instructions for synapse to use 0.9.4

* Changing syntax to grab ipynb files

* Line breaking to comply with styling

* Changing ipynb filter for windows

* Fixing string new line syntax

* Improvements to SynapseTests

* Adding more spark pools 3.2

* Adjusting list tests not to assert

* Improving dev doc, livyPayLoad

* Changing SynapseWS to mmlsparkppe

* Changing synapse URL to dogfood

* Removing dogfood from token acquisition

* Fixing excludes syntax

* Adding 2 more Apache Spark Pools

* Improving the developer docs

* Adjusting indentation on developer-readme

* Bumping Synapse test timeout to 40 min

* Applying PR feedback

Co-authored-by: Serena Ruan <82044803+serena-ruan@users.noreply.github.com>
  • Loading branch information
riserrad and serena-ruan committed Feb 7, 2022
1 parent 47f0c8f commit f070c2e
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ object FileUtilities {
val CREATE = S.CREATE
}

/** Recursively lists all regular files (not directories) under `f`.
  *
  * @param f root directory to walk; if `f` is not a directory or cannot be
  *          read, `File.listFiles()` returns null and an empty array is produced
  *          instead of throwing a NullPointerException.
  * @return every non-directory file found at any depth under `f`.
  *         Directories themselves are excluded at every level — the original
  *         version kept top-level directories but dropped nested ones, which
  *         was inconsistent; callers filter by file extension, so only regular
  *         files are ever useful here.
  */
def recursiveListFiles(f: File): Array[File] = {
  // listFiles() is null (not empty) for non-directories / I/O errors — guard it.
  val entries = Option(f.listFiles()).getOrElse(Array.empty[File])
  val (dirs, files) = entries.partition(_.isDirectory)
  files ++ dirs.flatMap(recursiveListFiles)
}

def allFiles(dir: File, pred: (File => Boolean) = null): Array[File] = {
def loop(dir: File): Array[File] = {
val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@ import scala.sys.process.Process
/** Tests to validate fuzzing of modules. */
class SynapseTests extends TestBase {

ignore("Synapse") {
test("Synapse") {

val os = sys.props("os.name").toLowerCase
os match {
case x if x contains "windows" =>
exec("conda activate synapseml && jupyter nbconvert --to script .\\notebooks\\*.ipynb")
exec("conda activate synapseml " +
"&& jupyter nbconvert --to script .\\notebooks\\features\\**\\*.ipynb")
case _ =>
Process(s"conda init bash; conda activate synapseml; jupyter nbconvert --to script ./notebooks/*.ipynb")
Process(s"conda init bash; conda activate synapseml; " +
"jupyter nbconvert --to script ./notebooks/features/**/*.ipynb")
}

SynapseUtilities.listPythonFiles().map(f => {
Expand All @@ -33,8 +35,13 @@ class SynapseTests extends TestBase {
new File(f).renameTo(new File(newPath))
})

val workspaceName = "mmlspark"
val sparkPools = Array("buildpool", "buildpool2", "buildpool3")
val workspaceName = "mmlsparkppe"
val sparkPools = Array(
"e2etstspark32i1",
"e2etstspark32i2",
"e2etstspark32i3",
"e2etstspark32i4",
"e2etstspark32i5")

val livyBatchJobs = SynapseUtilities.listPythonJobFiles()
.filterNot(_.contains(" "))
Expand All @@ -43,7 +50,7 @@ class SynapseTests extends TestBase {
val poolName = SynapseUtilities.monitorPool(workspaceName, sparkPools)
val livyUrl = "https://" +
workspaceName +
".dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
".dev.azuresynapse-dogfood.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
poolName +
"/batches"
val livyBatch: LivyBatch = SynapseUtilities.uploadAndSubmitNotebook(livyUrl, f)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,19 @@ object SynapseUtilities extends HasHttpClient {
lazy val Token: String = getSynapseToken

val Folder = s"build_${BuildInfo.version}/scripts"
val TimeoutInMillis: Int = 20 * 60 * 1000
val TimeoutInMillis: Int = 30 * 60 * 1000 // 30 minutes
val StorageAccount: String = "mmlsparkeuap"
val StorageContainer: String = "synapse"
val StorageContainer: String = "mmlsparkppefs"
val TenantId: String = "72f988bf-86f1-41af-91ab-2d7cd011db47"
val ClientId: String = "85dde348-dd2b-43e5-9f5a-22262af45332"

def listPythonFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile
.listFiles()

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".py"))
.filter(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains("CyberML"))
Expand All @@ -73,35 +74,40 @@ object SynapseUtilities extends HasHttpClient {
.filterNot(_.getAbsolutePath.contains("Overview"))
.filterNot(_.getAbsolutePath.contains("ModelInterpretation"))
.filterNot(_.getAbsolutePath.contains("Interpretability"))
.map(file => file.getAbsolutePath))
.get
.sorted
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def listPythonJobFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
.getCanonicalFile
.listFiles()
.filter(_.getAbsolutePath.endsWith(".py"))
.filterNot(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains(" "))
.map(file => file.getAbsolutePath))
.get
.sorted
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".py"))
.filterNot(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains(" "))
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def listNoteBookFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile
.listFiles()

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".ipynb"))
.map(file => file.getAbsolutePath))
.get
.sorted
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def postMortem(batch: LivyBatch, livyUrl: String): LivyBatch = {
Expand All @@ -122,7 +128,7 @@ object SynapseUtilities extends HasHttpClient {
def showSubmittingJobs(workspaceName: String, poolName: String): Applications = {
val uri: String =
"https://" +
s"$workspaceName.dev.azuresynapse.net" +
s"$workspaceName.dev.azuresynapse-dogfood.net" +
"/monitoring/workloadTypes/spark/applications" +
"?api-version=2020-10-01-preview" +
"&filter=(((state%20eq%20%27Queued%27)%20or%20(state%20eq%20%27Submitting%27))" +
Expand Down Expand Up @@ -152,7 +158,7 @@ object SynapseUtilities extends HasHttpClient {
readyPool
}
else {
println(s"None spark pool is ready to submit job, waiting 10s")
println(s"No spark pool is ready to submit a new job, waiting 10s")
blocking {
Thread.sleep(10000)
}
Expand Down Expand Up @@ -243,7 +249,8 @@ object SynapseUtilities extends HasHttpClient {
val excludes: String = "org.scala-lang:scala-reflect," +
"org.apache.spark:spark-tags_2.12," +
"org.scalactic:scalactic_2.12," +
"org.scalatest:scalatest_2.12"
"org.scalatest:scalatest_2.12," +
"org.slf4j:slf4j-api"

val livyPayload: String =
s"""
Expand All @@ -257,7 +264,7 @@ object SynapseUtilities extends HasHttpClient {
| "numExecutors" : 2,
| "conf" :
| {
| "spark.jars.packages" : "com.microsoft.azure:synapseml:${BuildInfo.version}",
| "spark.jars.packages" : "com.microsoft.azure:synapseml_2.12:${BuildInfo.version}",
| "spark.jars.repositories" : "https://mmlspark.azureedge.net/maven",
| "spark.jars.excludes": "$excludes",
| "spark.driver.userClassPathFirst": "true",
Expand Down
30 changes: 21 additions & 9 deletions website/docs/reference/developer-readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,32 @@ description: SynapseML Development Setup
# SynapseML Development Setup

1) [Install SBT](https://www.scala-sbt.org/1.x/docs/Setup.html)
- Make sure to download JDK 11 if you don't have it
3) Fork the repository on github
- This is required if you would like to make PRs. If you choose the fork option, replace the clone link below with that of your fork.
2) Git Clone your fork, or the repo directly
- `git clone https://github.com/Microsoft/SynapseML.git`
- NOTE: If you would like to contribute to synapseml regularly, add your fork as a remote named ``origin`` and Microsoft/SynapseML as a remote named ``upstream``
3) Run sbt to compile and grab datasets
- Make sure to download [JDK 11](https://www.oracle.com/java/technologies/javase/jdk11-archive-downloads.html) if you don't have it
2) Fork the repository on github
- See how to here: [Fork a repo - GitHub Docs](https://docs.github.com/en/get-started/quickstart/fork-a-repo)
3) Clone your fork
- `git clone https://github.com/<your GitHub handle>/SynapseML.git`
- This will automatically add your fork as the default remote, called `origin`
4) Add another Git Remote to track the original SynapseML repo. It's recommended to call it `upstream`:
- `git remote add upstream https://github.com/microsoft/SynapseML.git`
- See more about Git remotes here: [Git - Working with remotes](https://git-scm.com/book/en/v2/Git-Basics-Working-with-Remotes)
5) Run sbt to compile and grab datasets
- `cd synapseml`
- `sbt setup`
4) [Install IntelliJ](https://www.jetbrains.com/idea/download)
6) [Install IntelliJ](https://www.jetbrains.com/idea/download)
- Install Scala plugins during install
5) Configure IntelliJ
7) Configure IntelliJ
- **OPEN** the synapseml directory
- If the project does not automatically import, click on `build.sbt` and import project
8) Prepare your Python Environment
- Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
- Activate the `synapseml` conda environment by running `conda env create -f environment.yaml` from the `synapseml` directory.

> NOTE
>
> If you will be regularly contributing to the SynapseML repo, you'll want to keep your fork synced with the
> upstream repository. Please read [this GitHub doc](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork)
> to know more and learn techniques about how to do it.
# Publishing and Using Build Secrets

Expand Down

0 comments on commit f070c2e

Please sign in to comment.