feat: Added the DeepVisionClassifier, a simple API for deep transfer learning and fine-tuning of a variety of vision backbones (#1518)
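For orientation, below is a minimal PySpark sketch of how the new estimator can be used once the deep-learning package and its extras (torch, torchvision, horovod, pytorch_lightning) are installed. The column names, parameter values, and local image paths are illustrative assumptions rather than part of this commit; see the generated documentation and the sample notebook for the authoritative API.

from pyspark.sql import SparkSession
from synapse.ml.dl import DeepVisionClassifier

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy DataFrame of image file paths and integer class labels.
train_df = spark.createDataFrame(
    [("/tmp/images/cat_1.jpg", 0), ("/tmp/images/dog_1.jpg", 1)],
    ["image", "label"],
)

# Fine-tune a pretrained torchvision backbone on the labeled images.
# Distributed training may need further arguments (e.g. a Horovod store and
# callbacks), as demonstrated in the sample notebook added by this PR.
deep_vision_classifier = DeepVisionClassifier(
    backbone="resnet50",
    num_classes=2,
    batch_size=16,
    epochs=2,
)

deep_vision_model = deep_vision_classifier.fit(train_df)
predictions = deep_vision_model.transform(train_df)
predictions.show()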

* feat: add black as hooks for changes to python files

* use black to auto-format previous python files

* feat: add deep learning

* add DeepVisionClassifier

* update tests

* format

* update tests

* add sample notebook for dl

* format notebook

* remove extra comments

* move dl to separate folder

* remove unused package

* reconstruct dl folder and add setup.py; update notebook; tested on databricks

* address comment

* change deep learning tests to gpu tests

* address comments

* remove unused imports

* refactor notebook & horovod_installation script

* refactor GPU tests

* refactor some defaults and add DeepVisionModel

* rebase onto TorchEstimator

* fix args pass problem

* delete useless files

* clean up tests and refactor classes, fix keyword_only issue

* move to deep-learning folder

* fix tiny things

* move notebook inside features folder

* add GPU tests into pipeline

* apply codeGen to deepLearning project

* update gpu tests

* fix scalastyle

* move dl tests into synapsemltest folder

* remove spark require list

* add torchvision & horovod into environment.yml

* update environment.yml

* update environment.yml

* fixing R

* fix format

* update sparklyr version

* skip deep_vision_classifier test for now

* create PredictionParams and extend it

* format

* update notebook

* update gpu notebook pool

* fixing R tests

* change r-sparklyr version to 1.7.6

* update adb gpu runtime

* add installation of whl package in gpu notebook

* update r-sparklyr version

* fixing R tests

* fix pipeline

* fixing R tests

* fixing R tests

* reverting changes to sparklyr

* fixing R

* fix R tests

* fix torch version

* fix typo

* fix publish

* address comments

* address comments

* update format

* add missing "

* update databricks tests to show cluster name and change numWorkers for gpu tests

* fix problem where executors cannot find the synapse module; remove pip install process

* add pytorch_lightning into environment.yml file

* fix sphinx doc generation for dl

* format conf

* add documentation for the deep vision classifier to the website

* update dataset url to our blob since the official website is no longer reachable
serena-ruan committed Jul 28, 2022
1 parent e4f0883 commit 44c8ed5
Showing 26 changed files with 1,558 additions and 92 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -69,3 +69,14 @@ node_modules/
.bsp
website/.docusaurus
null/

# pytorch_lightning logs
**/lightning_logs/*

# pytest
.pytest_cache/

# python wheel
**/build/*
**/dist/*
**/*.egg-info/*
18 changes: 10 additions & 8 deletions build.sbt
@@ -197,7 +197,7 @@ generateDotnetDoc := {
val doxygenHelperFile = join(dotnetSrcDir, "DoxygenHelper.txt")
if (doxygenHelperFile.exists()) FileUtils.forceDelete(doxygenHelperFile)
FileUtils.writeStringToFile(doxygenHelperFile, fileContent, "utf-8")
runCmd(Seq("bash", "-c","cat DoxygenHelper.txt >> Doxyfile", ""), dotnetSrcDir)
runCmd(Seq("bash", "-c", "cat DoxygenHelper.txt >> Doxyfile", ""), dotnetSrcDir)
runCmd(Seq("doxygen"), dotnetSrcDir)
}

@@ -270,9 +270,11 @@ publishPypi := {

val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala, python and dotnet")
publishDocs := {
generatePythonDoc.value
(root / Compile / unidoc).value
generateDotnetDoc.value
Def.sequential(
generatePythonDoc,
generateDotnetDoc,
(root / Compile / unidoc)
).value
val html =
"""
|<html><body><pre style="font-size: 150%;">
@@ -382,10 +384,10 @@ lazy val cognitive = (project in file("cognitive"))
"com.azure" % "azure-ai-textanalytics" % "5.1.4"
),
dependencyOverrides ++= Seq(
"com.fasterxml.jackson.core" % "jackson-databind" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-core" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-annotations" % "2.12.5",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-xml" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-databind" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-core" % "2.12.5",
"com.fasterxml.jackson.core" % "jackson-annotations" % "2.12.5",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-xml" % "2.12.5",
"com.fasterxml.jackson.datatype" % "jackson-datatype-jsr310" % "2.12.5"
),
name := "synapseml-cognitive"
29 changes: 11 additions & 18 deletions core/src/main/python/synapse/doc/conf.py
@@ -12,11 +12,13 @@
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
"sphinx_paramlinks",
]

# Add any paths that contain templates here, relative to this directory.
@@ -106,26 +108,17 @@


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {"https://docs.python.org/": None}
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"torch": ("https://pytorch.org/docs/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pytorch_lightning": ("https://pytorch-lightning.readthedocs.io/en/stable/", None),
"torchvision": ("https://pytorch.org/vision/stable/", None),
}
# intersphinx_mapping = { "scala": ("/scala/index.html", None) }

# -- Mock out pandas+numpy that can't be found ----------------------------
import sys

try:
from unittest.mock import MagicMock # python >= 3.3
except ImportError:
from mock import Mock as MagicMock # older


class Mock(MagicMock):
@classmethod
def __getattr__(cls, name):
return MagicMock()


MOCK_MODULES = ["numpy", "pandas"]
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
# -- Mock out pandas that can't be found ----------------------------
autodoc_mock_imports = ["pandas"]

# -- Setup AutoStructify --------------------------------------------------
# Use this if we ever want to use markdown pages instead of rst pages.
@@ -6,6 +6,7 @@ package com.microsoft.azure.synapse.ml.codegen
import com.microsoft.azure.synapse.ml.build.BuildInfo
import com.microsoft.azure.synapse.ml.codegen.CodegenConfigProtocol._
import com.microsoft.azure.synapse.ml.codegen.DotnetCodegen.dotnetGen
import com.microsoft.azure.synapse.ml.codegen.GenerationUtils.indent
import com.microsoft.azure.synapse.ml.core.env.FileUtilities._
import com.microsoft.azure.synapse.ml.core.utils.JarLoadingUtils.instantiateServices
import org.apache.commons.io.FileUtils
@@ -88,6 +89,7 @@ object CodeGen {
|Suggests:
| testthat (>= 3.0.0)
|Config/testthat/edition: 3
|Encoding: UTF-8
|""".stripMargin)

val scalaVersion = BuildInfo.scalaVersion.split(".".toCharArray).dropRight(1).mkString(".")
@@ -134,11 +136,25 @@
}

//noinspection ScalaStyle
//scalastyle:off
def generatePyPackageData(conf: CodegenConfig): Unit = {
if (!conf.pySrcDir.exists()) {
conf.pySrcDir.mkdir()
}
val extraPackage = if (conf.name.endsWith("core")){" + [\"mmlspark\"]"}else{""}
val requireList = if(conf.name.contains("deep-learning")) {
s"""MINIMUM_SUPPORTED_PYTHON_VERSION = "3.8"""".stripMargin
} else ""
val extraRequirements = if (conf.name.contains("deep-learning")) {
s"""extras_require={"extras": [
| "cmake",
| "horovod==0.25.0",
| "pytorch_lightning>=1.5.0,<1.5.10",
| "torch==1.11.0",
| "torchvision>=0.12.0"
|]},
|python_requires=f">={MINIMUM_SUPPORTED_PYTHON_VERSION}",""".stripMargin
} else ""
writeFile(join(conf.pySrcDir, "setup.py"),
s"""
|# Copyright (C) Microsoft Corporation. All rights reserved.
@@ -149,6 +165,8 @@ object CodeGen {
|import codecs
|import os.path
|
|$requireList
|
|setup(
| name="${conf.name}",
| version="${conf.pythonizedVersion}",
@@ -171,10 +189,17 @@
| ],
| zip_safe=True,
| package_data={"synapseml": ["../LICENSE.txt", "../README.txt"]},
| project_urls={
| "Website": "https://microsoft.github.io/SynapseML/",
| "Documentation": "https://mmlspark.blob.core.windows.net/docs/${conf.pythonizedVersion}/pyspark/index.html",
| "Source Code": "https://github.com/Microsoft/SynapseML",
| },
|${indent(extraRequirements, 1)}
|)
|
|""".stripMargin)
}
//scalastyle:on

def rGen(conf: CodegenConfig): Unit = {
println(s"Generating R for ${conf.jarName}")
@@ -275,7 +275,9 @@ trait DotnetWrappable extends BaseWrappable {
val srcFolders = importPath.mkString(".")
.replaceAllLiterally("com.microsoft.azure.synapse.ml", "synapse.ml").split(".".toCharArray)
val srcDir = FileUtilities.join((Seq(conf.dotnetSrcDir.toString) ++ srcFolders.toSeq): _*)
srcDir.mkdirs()
if (!srcDir.exists()) {
srcDir.mkdirs()
}
Files.write(
FileUtilities.join(srcDir, dotnetClassName + ".cs").toPath,
dotnetClass().getBytes(StandardCharsets.UTF_8))
@@ -3,53 +3,18 @@

package com.microsoft.azure.synapse.ml.nbtest

import com.microsoft.azure.synapse.ml.core.test.base.TestBase
import com.microsoft.azure.synapse.ml.nbtest.DatabricksUtilities._

import java.util.concurrent.TimeUnit
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import scala.language.existentials

class DatabricksTests extends TestBase {
class DatabricksTests extends DatabricksTestHelper {

val clusterId: String = createClusterInPool(ClusterName, PoolId)
val jobIdsToCancel: mutable.ListBuffer[Int] = mutable.ListBuffer[Int]()

println("Checking if cluster is active")
tryWithRetries(Seq.fill(60 * 15)(1000).toArray) { () =>
assert(isClusterActive(clusterId))
}
println("Installing libraries")
installLibraries(clusterId)
tryWithRetries(Seq.fill(60 * 3)(1000).toArray) { () =>
assert(areLibrariesInstalled(clusterId))
}
println(s"Creating folder $Folder")
workspaceMkDir(Folder)

println(s"Submitting jobs")
val parNotebookRuns: Seq[DatabricksNotebookRun] = ParallelizableNotebooks.map(uploadAndSubmitNotebook(clusterId, _))
parNotebookRuns.foreach(notebookRun => jobIdsToCancel.append(notebookRun.runId))

println(s"Submitted ${parNotebookRuns.length} for execution: ${parNotebookRuns.map(_.runId).toList}")

assert(parNotebookRuns.nonEmpty)

parNotebookRuns.foreach(run => {
println(s"Testing ${run.notebookName}")

test(run.notebookName) {
val result = Await.ready(
run.monitor(logLevel = 0),
Duration(TimeoutInMillis.toLong, TimeUnit.MILLISECONDS)).value.get

if (!result.isSuccess){
throw result.failed.get
}
}
})
val clusterId: String = createClusterInPool(ClusterName, AdbRuntime, NumWorkers, PoolId, "[]")
val jobIdsToCancel: ListBuffer[Int] = databricksTestHelper(clusterId, Libraries, CPUNotebooks)

println(s"Submitting nonparallelizable job...")
NonParallelizableNotebooks.toIterator.foreach(notebook => {
@@ -68,16 +33,12 @@ class DatabricksTests extends TestBase {
})

protected override def afterAll(): Unit = {
println("Suite DatabricksTests finished. Running afterAll procedure...")
jobIdsToCancel.foreach(cancelRun)

deleteCluster(clusterId)
println(s"Deleted cluster with Id $clusterId.")
afterAllHelper(jobIdsToCancel, clusterId, ClusterName)

super.afterAll()
}

ignore("list running jobs for convenievce") {
ignore("list running jobs for convenience") {
val obj = databricksGet("jobs/runs/list?active_only=true&limit=1000")
println(obj)
}
