Skip to content

Commit

Permalink
fix: Make SynapseE2E Tests work now with Spark 3.2 (#1362)
Browse files Browse the repository at this point in the history
* Trying to use only pool with Spark 3.2

* Updating install instructions for synapse to use 0.9.4

* Changing syntax to grab ipynb files

* Line breaking to comply with styling

* Changing ipynb filter for windows

* Fixing string new line syntax

* Improvements to SynapseTests

* Adding more spark pools 3.2

* Adjusting list tests not to assert

* Improving dev doc, livyPayLoad

* Changing SynapseWS to mmlsparkppe

* Changing synapse URL to dogfood

* Removing dogfood from token acquisition

* Fixing excludes syntax

* Adding 2 more Apache Spark Pools

* Improving the developer docs

* Adjusting indentation on developer-readme

* Bumping Synapse test timeout to 40 min

* Applying PR feedback

Co-authored-by: Serena Ruan <82044803+serena-ruan@users.noreply.github.com>
  • Loading branch information
riserrad and serena-ruan committed Feb 7, 2022
1 parent 47f0c8f commit f070c2e
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ object FileUtilities {
val CREATE = S.CREATE
}

/** Recursively lists all regular files (not directories) under `f`.
  *
  * @param f root directory to walk; if `f` is not a directory or cannot be
  *          read, `File.listFiles()` returns null and an empty array is produced
  *          instead of throwing a NullPointerException.
  * @return every non-directory file found at any depth under `f`.
  *         Directories themselves are excluded at every level — the original
  *         version kept top-level directories but dropped nested ones, which
  *         was inconsistent; callers filter by file extension, so only regular
  *         files are ever useful here.
  */
def recursiveListFiles(f: File): Array[File] = {
  // listFiles() is null (not empty) for non-directories / I/O errors — guard it.
  val entries = Option(f.listFiles()).getOrElse(Array.empty[File])
  val (dirs, files) = entries.partition(_.isDirectory)
  files ++ dirs.flatMap(recursiveListFiles)
}

def allFiles(dir: File, pred: (File => Boolean) = null): Array[File] = {
def loop(dir: File): Array[File] = {
val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@ import scala.sys.process.Process
/** Tests to validate fuzzing of modules. */
class SynapseTests extends TestBase {

ignore("Synapse") {
test("Synapse") {

val os = sys.props("os.name").toLowerCase
os match {
case x if x contains "windows" =>
exec("conda activate synapseml && jupyter nbconvert --to script .\\notebooks\\*.ipynb")
exec("conda activate synapseml " +
"&& jupyter nbconvert --to script .\\notebooks\\features\\**\\*.ipynb")
case _ =>
Process(s"conda init bash; conda activate synapseml; jupyter nbconvert --to script ./notebooks/*.ipynb")
Process(s"conda init bash; conda activate synapseml; " +
"jupyter nbconvert --to script ./notebooks/features/**/*.ipynb")
}

SynapseUtilities.listPythonFiles().map(f => {
Expand All @@ -33,8 +35,13 @@ class SynapseTests extends TestBase {
new File(f).renameTo(new File(newPath))
})

val workspaceName = "mmlspark"
val sparkPools = Array("buildpool", "buildpool2", "buildpool3")
val workspaceName = "mmlsparkppe"
val sparkPools = Array(
"e2etstspark32i1",
"e2etstspark32i2",
"e2etstspark32i3",
"e2etstspark32i4",
"e2etstspark32i5")

val livyBatchJobs = SynapseUtilities.listPythonJobFiles()
.filterNot(_.contains(" "))
Expand All @@ -43,7 +50,7 @@ class SynapseTests extends TestBase {
val poolName = SynapseUtilities.monitorPool(workspaceName, sparkPools)
val livyUrl = "https://" +
workspaceName +
".dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
".dev.azuresynapse-dogfood.net/livyApi/versions/2019-11-01-preview/sparkPools/" +
poolName +
"/batches"
val livyBatch: LivyBatch = SynapseUtilities.uploadAndSubmitNotebook(livyUrl, f)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,19 @@ object SynapseUtilities extends HasHttpClient {
lazy val Token: String = getSynapseToken

val Folder = s"build_${BuildInfo.version}/scripts"
val TimeoutInMillis: Int = 20 * 60 * 1000
val TimeoutInMillis: Int = 30 * 60 * 1000 // 30 minutes
val StorageAccount: String = "mmlsparkeuap"
val StorageContainer: String = "synapse"
val StorageContainer: String = "mmlsparkppefs"
val TenantId: String = "72f988bf-86f1-41af-91ab-2d7cd011db47"
val ClientId: String = "85dde348-dd2b-43e5-9f5a-22262af45332"

def listPythonFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile
.listFiles()

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".py"))
.filter(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains("CyberML"))
Expand All @@ -73,35 +74,40 @@ object SynapseUtilities extends HasHttpClient {
.filterNot(_.getAbsolutePath.contains("Overview"))
.filterNot(_.getAbsolutePath.contains("ModelInterpretation"))
.filterNot(_.getAbsolutePath.contains("Interpretability"))
.map(file => file.getAbsolutePath))
.get
.sorted
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def listPythonJobFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
.getCanonicalFile
.listFiles()
.filter(_.getAbsolutePath.endsWith(".py"))
.filterNot(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains(" "))
.map(file => file.getAbsolutePath))
.get
.sorted
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".py"))
.filterNot(_.getAbsolutePath.contains("-"))
.filterNot(_.getAbsolutePath.contains(" "))
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def listNoteBookFiles(): Array[String] = {
Option(
FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks")
Option({
val rootDirectory = FileUtilities
.join(BuildInfo.baseDirectory.getParent, "notebooks/features")
.getCanonicalFile
.listFiles()

FileUtilities.recursiveListFiles(rootDirectory)
.filter(_.getAbsolutePath.endsWith(".ipynb"))
.map(file => file.getAbsolutePath))
.get
.sorted
.map(file => file.getAbsolutePath)
})
.get
.sorted
}

def postMortem(batch: LivyBatch, livyUrl: String): LivyBatch = {
Expand All @@ -122,7 +128,7 @@ object SynapseUtilities extends HasHttpClient {
def showSubmittingJobs(workspaceName: String, poolName: String): Applications = {
val uri: String =
"https://" +
s"$workspaceName.dev.azuresynapse.net" +
s"$workspaceName.dev.azuresynapse-dogfood.net" +
"/monitoring/workloadTypes/spark/applications" +
"?api-version=2020-10-01-preview" +
"&filter=(((state%20eq%20%27Queued%27)%20or%20(state%20eq%20%27Submitting%27))" +
Expand Down Expand Up @@ -152,7 +158,7 @@ object SynapseUtilities extends HasHttpClient {
readyPool
}
else {
println(s"None spark pool is ready to submit job, waiting 10s")
println(s"No spark pool is ready to submit a new job, waiting 10s")
blocking {
Thread.sleep(10000)
}
Expand Down Expand Up @@ -243,7 +249,8 @@ object SynapseUtilities extends HasHttpClient {
val excludes: String = "org.scala-lang:scala-reflect," +
"org.apache.spark:spark-tags_2.12," +
"org.scalactic:scalactic_2.12," +
"org.scalatest:scalatest_2.12"
"org.scalatest:scalatest_2.12," +
"org.slf4j:slf4j-api"

val livyPayload: String =
s"""
Expand All @@ -257,7 +264,7 @@ object SynapseUtilities extends HasHttpClient {
| "numExecutors" : 2,
| "conf" :
| {
| "spark.jars.packages" : "com.microsoft.azure:synapseml:${BuildInfo.version}",
| "spark.jars.packages" : "com.microsoft.azure:synapseml_2.12:${BuildInfo.version}",
| "spark.jars.repositories" : "https://mmlspark.azureedge.net/maven",
| "spark.jars.excludes": "$excludes",
| "spark.driver.userClassPathFirst": "true",
Expand Down
30 changes: 21 additions & 9 deletions website/docs/reference/developer-readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,32 @@ description: SynapseML Development Setup
# SynapseML Development Setup

1) [Install SBT](https://www.scala-sbt.org/1.x/docs/Setup.html)
- Make sure to download JDK 11 if you don't have it
3) Fork the repository on github
- This is required if you would like to make PRs. If you choose the fork option, replace the clone link below with that of your fork.
2) Git Clone your fork, or the repo directly
- `git clone https://github.com/Microsoft/SynapseML.git`
- NOTE: If you would like to contribute to synapseml regularly, add your fork as a remote named ``origin`` and Microsoft/SynapseML as a remote named ``upstream``
3) Run sbt to compile and grab datasets
- Make sure to download [JDK 11](https://www.oracle.com/java/technologies/javase/jdk11-archive-downloads.html) if you don't have it
2) Fork the repository on github
- See how to here: [Fork a repo - GitHub Docs](https://docs.github.com/en/get-started/quickstart/fork-a-repo)
3) Clone your fork
- `git clone https://github.com/<your GitHub handle>/SynapseML.git`
- This will automatically add your fork as the default remote, called `origin`
4) Add another Git Remote to track the original SynapseML repo. It's recommended to call it `upstream`:
- `git remote add upstream https://github.com/microsoft/SynapseML.git`
- See more about Git remotes here: [Git - Working with remotes](https://git-scm.com/book/en/v2/Git-Basics-Working-with-Remotes)
5) Run sbt to compile and grab datasets
- `cd synapseml`
- `sbt setup`
4) [Install IntelliJ](https://www.jetbrains.com/idea/download)
6) [Install IntelliJ](https://www.jetbrains.com/idea/download)
- Install Scala plugins during install
5) Configure IntelliJ
7) Configure IntelliJ
- **OPEN** the synapseml directory
- If the project does not automatically import, click on `build.sbt` and import project
8) Prepare your Python Environment
- Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
- Activate the `synapseml` conda environment by running `conda env create -f environment.yaml` from the `synapseml` directory.

> NOTE
>
> If you will be regularly contributing to the SynapseML repo, you'll want to keep your fork synced with the
> upstream repository. Please read [this GitHub doc](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork)
> to know more and learn techniques about how to do it.
# Publishing and Using Build Secrets

Expand Down

0 comments on commit f070c2e

Please sign in to comment.