diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
index f2b2907f9e..c59c133b3e 100644
--- a/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
+++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/FileUtilities.scala
@@ -26,6 +26,14 @@ object FileUtilities {
     val CREATE = S.CREATE
   }
 
+  // List f's children and, recursively, everything below its subdirectories.
+  def recursiveListFiles(f: File): Array[File] = {
+    val these = f.listFiles()
+    these ++ these
+      .filter(_.isDirectory)
+      .flatMap(recursiveListFiles)
+  }
+
   def allFiles(dir: File, pred: (File => Boolean) = null): Array[File] = {
     def loop(dir: File): Array[File] = {
       val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory)
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
index 54009bfdf5..cdd409f520 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
@@ -16,14 +16,16 @@ import scala.sys.process.Process
 
 /** Tests to validate fuzzing of modules. */
 class SynapseTests extends TestBase {
-  ignore("Synapse") {
+  test("Synapse") {
     val os = sys.props("os.name").toLowerCase
     os match {
       case x if x contains "windows" =>
-        exec("conda activate synapseml && jupyter nbconvert --to script .\\notebooks\\*.ipynb")
+        exec("conda activate synapseml " +
+          "&& jupyter nbconvert --to script .\\notebooks\\features\\**\\*.ipynb")
       case _ =>
-        Process(s"conda init bash; conda activate synapseml; jupyter nbconvert --to script ./notebooks/*.ipynb")
+        Process(s"conda init bash; conda activate synapseml; " +
+          "jupyter nbconvert --to script ./notebooks/features/**/*.ipynb")
     }
 
     SynapseUtilities.listPythonFiles().map(f => {
@@ -33,8 +35,13 @@ class SynapseTests extends TestBase {
       new File(f).renameTo(new File(newPath))
     })
 
-    val workspaceName = "mmlspark"
-    val sparkPools = Array("buildpool", "buildpool2", "buildpool3")
+    val workspaceName = "mmlsparkppe"
+    val sparkPools = Array(
+      "e2etstspark32i1",
+      "e2etstspark32i2",
+      "e2etstspark32i3",
+      "e2etstspark32i4",
+      "e2etstspark32i5")
 
     val livyBatchJobs = SynapseUtilities.listPythonJobFiles()
       .filterNot(_.contains(" "))
@@ -43,7 +50,7 @@
         val poolName = SynapseUtilities.monitorPool(workspaceName, sparkPools)
         val livyUrl = "https://" +
           workspaceName +
-          ".dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
+          ".dev.azuresynapse-dogfood.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
           poolName +
           "/batches"
         val livyBatch: LivyBatch = SynapseUtilities.uploadAndSubmitNotebook(livyUrl, f)
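Reviewer note: `recursiveListFiles` above replaces the single-level `listFiles()` calls in `SynapseUtilities` (next file) so that notebooks nested anywhere under `notebooks/features` are still discovered. A minimal usage sketch, assuming the SynapseML core classes are on the classpath and the directory exists (the path here is illustrative, not part of this change):

```scala
import java.io.File
import com.microsoft.azure.synapse.ml.core.env.FileUtilities

object ListScriptsSketch extends App {
  // Walk notebooks/features recursively and keep only converted .py scripts,
  // mirroring how listPythonFiles() consumes the helper.
  val root = new File("notebooks/features").getCanonicalFile
  val scripts: Array[String] = FileUtilities.recursiveListFiles(root)
    .filter(_.getName.endsWith(".py"))
    .map(_.getAbsolutePath)
    .sorted
  scripts.foreach(println)
}
```

Note that the helper returns directories as well as plain files; callers are expected to filter, as the extension checks here and in `SynapseUtilities` do.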
"synapse" + val StorageContainer: String = "mmlsparkppefs" val TenantId: String = "72f988bf-86f1-41af-91ab-2d7cd011db47" val ClientId: String = "85dde348-dd2b-43e5-9f5a-22262af45332" def listPythonFiles(): Array[String] = { - Option( - FileUtilities - .join(BuildInfo.baseDirectory.getParent, "notebooks") + Option({ + val rootDirectory = FileUtilities + .join(BuildInfo.baseDirectory.getParent, "notebooks/features") .getCanonicalFile - .listFiles() + + FileUtilities.recursiveListFiles(rootDirectory) .filter(_.getAbsolutePath.endsWith(".py")) .filter(_.getAbsolutePath.contains("-")) .filterNot(_.getAbsolutePath.contains("CyberML")) @@ -73,35 +74,40 @@ object SynapseUtilities extends HasHttpClient { .filterNot(_.getAbsolutePath.contains("Overview")) .filterNot(_.getAbsolutePath.contains("ModelInterpretation")) .filterNot(_.getAbsolutePath.contains("Interpretability")) - .map(file => file.getAbsolutePath)) - .get - .sorted + .map(file => file.getAbsolutePath) + }) + .get + .sorted } def listPythonJobFiles(): Array[String] = { - Option( - FileUtilities - .join(BuildInfo.baseDirectory.getParent, "notebooks") - .getCanonicalFile - .listFiles() - .filter(_.getAbsolutePath.endsWith(".py")) - .filterNot(_.getAbsolutePath.contains("-")) - .filterNot(_.getAbsolutePath.contains(" ")) - .map(file => file.getAbsolutePath)) - .get - .sorted + Option({ + val rootDirectory = FileUtilities + .join(BuildInfo.baseDirectory.getParent, "notebooks/features") + .getCanonicalFile + + FileUtilities.recursiveListFiles(rootDirectory) + .filter(_.getAbsolutePath.endsWith(".py")) + .filterNot(_.getAbsolutePath.contains("-")) + .filterNot(_.getAbsolutePath.contains(" ")) + .map(file => file.getAbsolutePath) + }) + .get + .sorted } def listNoteBookFiles(): Array[String] = { - Option( - FileUtilities - .join(BuildInfo.baseDirectory.getParent, "notebooks") + Option({ + val rootDirectory = FileUtilities + .join(BuildInfo.baseDirectory.getParent, "notebooks/features") .getCanonicalFile - .listFiles() + + FileUtilities.recursiveListFiles(rootDirectory) .filter(_.getAbsolutePath.endsWith(".ipynb")) - .map(file => file.getAbsolutePath)) - .get - .sorted + .map(file => file.getAbsolutePath) + }) + .get + .sorted } def postMortem(batch: LivyBatch, livyUrl: String): LivyBatch = { @@ -122,7 +128,7 @@ object SynapseUtilities extends HasHttpClient { def showSubmittingJobs(workspaceName: String, poolName: String): Applications = { val uri: String = "https://" + - s"$workspaceName.dev.azuresynapse.net" + + s"$workspaceName.dev.azuresynapse-dogfood.net" + "/monitoring/workloadTypes/spark/applications" + "?api-version=2020-10-01-preview" + "&filter=(((state%20eq%20%27Queued%27)%20or%20(state%20eq%20%27Submitting%27))" + @@ -152,7 +158,7 @@ object SynapseUtilities extends HasHttpClient { readyPool } else { - println(s"None spark pool is ready to submit job, waiting 10s") + println(s"No spark pool is ready to submit a new job, waiting 10s") blocking { Thread.sleep(10000) } @@ -243,7 +249,8 @@ object SynapseUtilities extends HasHttpClient { val excludes: String = "org.scala-lang:scala-reflect," + "org.apache.spark:spark-tags_2.12," + "org.scalactic:scalactic_2.12," + - "org.scalatest:scalatest_2.12" + "org.scalatest:scalatest_2.12," + + "org.slf4j:slf4j-api" val livyPayload: String = s""" @@ -257,7 +264,7 @@ object SynapseUtilities extends HasHttpClient { | "numExecutors" : 2, | "conf" : | { - | "spark.jars.packages" : "com.microsoft.azure:synapseml:${BuildInfo.version}", + | "spark.jars.packages" : 
"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}", | "spark.jars.repositories" : "https://mmlspark.azureedge.net/maven", | "spark.jars.excludes": "$excludes", | "spark.driver.userClassPathFirst": "true", diff --git a/website/docs/reference/developer-readme.md b/website/docs/reference/developer-readme.md index d711c04b5f..897b5828f0 100644 --- a/website/docs/reference/developer-readme.md +++ b/website/docs/reference/developer-readme.md @@ -8,20 +8,32 @@ description: SynapseML Development Setup # SynapseML Development Setup 1) [Install SBT](https://www.scala-sbt.org/1.x/docs/Setup.html) - - Make sure to download JDK 11 if you don't have it -3) Fork the repository on github - - This is required if you would like to make PRs. If you choose the fork option, replace the clone link below with that of your fork. -2) Git Clone your fork, or the repo directly - - `git clone https://github.com/Microsoft/SynapseML.git` - - NOTE: If you would like to contribute to synapseml regularly, add your fork as a remote named ``origin`` and Microsoft/SynapseML as a remote named ``upstream`` -3) Run sbt to compile and grab datasets + - Make sure to download [JDK 11](https://www.oracle.com/java/technologies/javase/jdk11-archive-downloads.html) if you don't have it +2) Fork the repository on github + - See how to here: [Fork a repo - GitHub Docs](https://docs.github.com/en/get-started/quickstart/fork-a-repo) +3) Clone your fork + - `git clone https://github.com//SynapseML.git` + - This will automatically add your fork as the default remote, called `origin` +4) Add another Git Remote to track the original SynapseML repo. It's recommended to call it `upstream`: + - `git remote add upstream https://github.com/microsoft/SynapseML.git` + - See more about Git remotes here: [Git - Working with remotes](https://git-scm.com/book/en/v2/Git-Basics-Working-with-Remotes) +5) Run sbt to compile and grab datasets - `cd synapseml` - `sbt setup` -4) [Install IntelliJ](https://www.jetbrains.com/idea/download) +6) [Install IntelliJ](https://www.jetbrains.com/idea/download) - Install Scala plugins during install -5) Configure IntelliJ +7) Configure IntelliJ - **OPEN** the synapseml directory - If the project does not automatically import,click on `build.sbt` and import project +8) Prepare your Python Environment + - Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) + - Activate the `synapseml` conda environment by running `conda env create -f environment.yaml` from the `synapseml` directory. + +> NOTE +> +> If you will be regularly contributing to the SynapseML repo, you'll want to keep your fork synced with the +> upstream repository. Please read [this GitHub doc](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork) +> to know more and learn techniques about how to do it. # Publishing and Using Build Secrets