feat: Added the DeepVisionClassifier, a simple API for deep transfer learning and fine-tuning of a variety of vision backbones (#1518)
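For orientation, below is a minimal PySpark sketch of how the new estimator can be used once the deep-learning package and its extras (torch, torchvision, horovod, pytorch_lightning) are installed. The column names, parameter values, and local image paths are illustrative assumptions rather than part of this commit; see the generated documentation and the sample notebook for the authoritative API.

from pyspark.sql import SparkSession
from synapse.ml.dl import DeepVisionClassifier

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy DataFrame of image file paths and integer class labels.
train_df = spark.createDataFrame(
    [("/tmp/images/cat_1.jpg", 0), ("/tmp/images/dog_1.jpg", 1)],
    ["image", "label"],
)

# Fine-tune a pretrained torchvision backbone on the labeled images.
# Distributed training may need further arguments (e.g. a Horovod store and
# callbacks), as demonstrated in the sample notebook added by this PR.
deep_vision_classifier = DeepVisionClassifier(
    backbone="resnet50",
    num_classes=2,
    batch_size=16,
    epochs=2,
)

deep_vision_model = deep_vision_classifier.fit(train_df)
predictions = deep_vision_model.transform(train_df)
predictions.show()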

* feat: add black as hooks for changes to python files

* use black to auto-format previous python files

* feat: add deep learning

* add DeepVisionClassifier

* update tests

* format

* update tests

* add sample notebook for dl

* format notebook

* remove extra comments

* move dl to separate folder

* remove unused package

* reconstruct dl folder and add setup.py; update notebook; tested on databricks

* address comment

* change deep learning tests to gpu tests

* address comments

* remove unused imports

* refactor notebook & horovod_installation script

* refactor GPU tests

* refactor some defaults and add DeepVisionModel

* rebase onto TorchEstimator

* fix args pass problem

* delete useless files

* clean up tests and refactor classes, fix keyword_only issue

* move to deep-learning folder

* fix tiny things

* move notebook inside features folder

* add GPU tests into pipeline

* apply codeGen to deepLearning project

* update gpu tests

* fix scalastyle

* move dl tests into synapsemltest folder

* remove spark require list

* add torchvision & horovod into environment.yml

* update environment.yml

* update environment.yml

* fixing R

* fix format

* update sparklyr version

* skip deep_vision_classifier test for now

* create PredictionParams and extend it

* format

* update notebook

* update gpu notebook pool

* fixing R tests

* change r-sparklyr version to 1.7.6

* update adb gpu runtime

* add installation of whl package in gpu notebook

* update r-sparklyr version

* fixing R tests

* fix pipeline

* fixing R tests

* fixing R tests

* reverting changes to sparklyr

* fixing R

* fix R tests

* fix torch version

* fix typo

* fix publish

* address comments

* address comments

* update format

* add missing "

* update databricks tests to show cluster name and change numWorkers for gpu tests

* fix problem where executors cannot find the synapse module; remove pip install process

* add pytorch_lightning into environment.yml file

* fix sphinx doc generation for dl

* format conf

* add documentation for the deep vision classifier to the website

* update dataset url to our blob since the official website is no longer reachable
serena-ruan committed Jul 28, 2022
1 parent e4f0883 commit 44c8ed5
Showing 26 changed files with 1,558 additions and 92 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -69,3 +69,14 @@ node_modules/
.bsp
website/.docusaurus
null/

# pytorch_lightning logs
**/lightning_logs/*

# pytest
.pytest_cache/

# python wheel
**/build/*
**/dist/*
**/*.egg-info/*
18 changes: 10 additions & 8 deletions build.sbt
@@ -197,7 +197,7 @@ generateDotnetDoc := {
val doxygenHelperFile = join(dotnetSrcDir, "DoxygenHelper.txt")
if (doxygenHelperFile.exists()) FileUtils.forceDelete(doxygenHelperFile)
FileUtils.writeStringToFile(doxygenHelperFile, fileContent, "utf-8")
runCmd(Seq("bash", "-c","cat DoxygenHelper.txt >> Doxyfile", ""), dotnetSrcDir)
runCmd(Seq("bash", "-c", "cat DoxygenHelper.txt >> Doxyfile", ""), dotnetSrcDir)
runCmd(Seq("doxygen"), dotnetSrcDir)
}

@@ -270,9 +270,11 @@ publishPypi := {

val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala, python and dotnet")
publishDocs := {
generatePythonDoc.value
(root / Compile / unidoc).value
generateDotnetDoc.value
Def.sequential(
generatePythonDoc,
generateDotnetDoc,
(root / Compile / unidoc)
).value
val html =
"""
|<html><body><pre style="font-size: 150%;">
@@ -382,10 +384,10 @@ lazy val cognitive = (project in file("cognitive"))
"com.azure" % "azure-ai-textanalytics" % "5.1.4"
),
dependencyOverrides ++= Seq(
"com.fasterxml.jackson.core" % "jackson-databind" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-core" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-annotations" % "2.12.5",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-xml" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-databind" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-core" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-annotations" % "2.12.5",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-xml" % "2.12.5",
"com.fasterxml.jackson.datatype" % "jackson-datatype-jsr310" % "2.12.5"
),
name := "synapseml-cognitive"
29 changes: 11 additions & 18 deletions core/src/main/python/synapse/doc/conf.py
@@ -12,11 +12,13 @@
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinx_paramlinks",
]

# Add any paths that contain templates here, relative to this directory.
@@ -106,26 +108,17 @@


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {"https://docs.python.org/": None}
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"torch": ("https://pytorch.org/docs/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pytorch_lightning": ("https://pytorch-lightning.readthedocs.io/en/stable/", None),
"torchvision": ("https://pytorch.org/vision/stable/", None),
}
# intersphinx_mapping = { "scala": ("/scala/index.html", None) }

# -- Mock out pandas+numpy that can't be found ----------------------------
import sys

try:
from unittest.mock import MagicMock # python >= 3.3
except ImportError:
from mock import Mock as MagicMock # older


class Mock(MagicMock):
@classmethod
def __getattr__(cls, name):
return MagicMock()


MOCK_MODULES = ["numpy", "pandas"]
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
# -- Mock out pandas that can't be found ----------------------------
autodoc_mock_imports = ["pandas"]

# -- Setup AutoStructify --------------------------------------------------
# Use this if we ever want to use markdown pages instead of rst pages.
@@ -6,6 +6,7 @@ package com.microsoft.azure.synapse.ml.codegen
import com.microsoft.azure.synapse.ml.build.BuildInfo
import com.microsoft.azure.synapse.ml.codegen.CodegenConfigProtocol._
import com.microsoft.azure.synapse.ml.codegen.DotnetCodegen.dotnetGen
import com.microsoft.azure.synapse.ml.codegen.GenerationUtils.indent
import com.microsoft.azure.synapse.ml.core.env.FileUtilities._
import com.microsoft.azure.synapse.ml.core.utils.JarLoadingUtils.instantiateServices
import org.apache.commons.io.FileUtils
@@ -88,6 +89,7 @@ object CodeGen {
|Suggests:
| testthat (>= 3.0.0)
|Config/testthat/edition: 3
|Encoding: UTF-8
|""".stripMargin)

val scalaVersion = BuildInfo.scalaVersion.split(".".toCharArray).dropRight(1).mkString(".")
@@ -134,11 +136,25 @@
}

//noinspection ScalaStyle
//scalastyle:off
def generatePyPackageData(conf: CodegenConfig): Unit = {
if (!conf.pySrcDir.exists()) {
conf.pySrcDir.mkdir()
}
val extraPackage = if (conf.name.endsWith("core")){" + [\"mmlspark\"]"}else{""}
val requireList = if(conf.name.contains("deep-learning")) {
s"""MINIMUM_SUPPORTED_PYTHON_VERSION = "3.8"""".stripMargin
} else ""
val extraRequirements = if (conf.name.contains("deep-learning")) {
s"""extras_require={"extras": [
| "cmake",
| "horovod==0.25.0",
| "pytorch_lightning>=1.5.0,<1.5.10",
| "torch==1.11.0",
| "torchvision>=0.12.0"
|]},
|python_requires=f">={MINIMUM_SUPPORTED_PYTHON_VERSION}",""".stripMargin
} else ""
writeFile(join(conf.pySrcDir, "setup.py"),
s"""
|# Copyright (C) Microsoft Corporation. All rights reserved.
@@ -149,6 +165,8 @@ object CodeGen {
|import codecs
|import os.path
|
|$requireList
|
|setup(
| name="${conf.name}",
| version="${conf.pythonizedVersion}",
@@ -171,10 +189,17 @@
| ],
| zip_safe=True,
| package_data={"synapseml": ["../LICENSE.txt", "../README.txt"]},
| project_urls={
| "Website": "https://microsoft.github.io/SynapseML/",
| "Documentation": "https://mmlspark.blob.core.windows.net/docs/${conf.pythonizedVersion}/pyspark/index.html",
| "Source Code": "https://github.com/Microsoft/SynapseML",
| },
|${indent(extraRequirements, 1)}
|)
|
|""".stripMargin)
}
//scalastyle:on

def rGen(conf: CodegenConfig): Unit = {
println(s"Generating R for ${conf.jarName}")
@@ -275,7 +275,9 @@ trait DotnetWrappable extends BaseWrappable {
val srcFolders = importPath.mkString(".")
.replaceAllLiterally("com.microsoft.azure.synapse.ml", "synapse.ml").split(".".toCharArray)
val srcDir = FileUtilities.join((Seq(conf.dotnetSrcDir.toString) ++ srcFolders.toSeq): _*)
srcDir.mkdirs()
if (!srcDir.exists()) {
srcDir.mkdirs()
}
Files.write(
FileUtilities.join(srcDir, dotnetClassName + ".cs").toPath,
dotnetClass().getBytes(StandardCharsets.UTF_8))
@@ -3,53 +3,18 @@

package com.microsoft.azure.synapse.ml.nbtest

import com.microsoft.azure.synapse.ml.core.test.base.TestBase
import com.microsoft.azure.synapse.ml.nbtest.DatabricksUtilities._

import java.util.concurrent.TimeUnit
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import scala.language.existentials

class DatabricksTests extends TestBase {
class DatabricksTests extends DatabricksTestHelper {

val clusterId: String = createClusterInPool(ClusterName, PoolId)
val jobIdsToCancel: mutable.ListBuffer[Int] = mutable.ListBuffer[Int]()

println("Checking if cluster is active")
tryWithRetries(Seq.fill(60 * 15)(1000).toArray) { () =>
assert(isClusterActive(clusterId))
}
println("Installing libraries")
installLibraries(clusterId)
tryWithRetries(Seq.fill(60 * 3)(1000).toArray) { () =>
assert(areLibrariesInstalled(clusterId))
}
println(s"Creating folder $Folder")
workspaceMkDir(Folder)

println(s"Submitting jobs")
val parNotebookRuns: Seq[DatabricksNotebookRun] = ParallelizableNotebooks.map(uploadAndSubmitNotebook(clusterId, _))
parNotebookRuns.foreach(notebookRun => jobIdsToCancel.append(notebookRun.runId))

println(s"Submitted ${parNotebookRuns.length} for execution: ${parNotebookRuns.map(_.runId).toList}")

assert(parNotebookRuns.nonEmpty)

parNotebookRuns.foreach(run => {
println(s"Testing ${run.notebookName}")

test(run.notebookName) {
val result = Await.ready(
run.monitor(logLevel = 0),
Duration(TimeoutInMillis.toLong, TimeUnit.MILLISECONDS)).value.get

if (!result.isSuccess){
throw result.failed.get
}
}
})
val clusterId: String = createClusterInPool(ClusterName, AdbRuntime, NumWorkers, PoolId, "[]")
val jobIdsToCancel: ListBuffer[Int] = databricksTestHelper(clusterId, Libraries, CPUNotebooks)

println(s"Submitting nonparallelizable job...")
NonParallelizableNotebooks.toIterator.foreach(notebook => {
@@ -68,16 +33,12 @@ class DatabricksTests extends TestBase {
})

protected override def afterAll(): Unit = {
println("Suite DatabricksTests finished. Running afterAll procedure...")
jobIdsToCancel.foreach(cancelRun)

deleteCluster(clusterId)
println(s"Deleted cluster with Id $clusterId.")
afterAllHelper(jobIdsToCancel, clusterId, ClusterName)

super.afterAll()
}

ignore("list running jobs for convenievce") {
ignore("list running jobs for convenience") {
val obj = databricksGet("jobs/runs/list?active_only=true&limit=1000")
println(obj)
}
