From 4d1e8e967a67a5b9f62f7e11469bdc1d4d544d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20P=C3=A9rez-Garc=C3=ADa?= Date: Tue, 17 May 2022 13:25:04 +0100 Subject: [PATCH] Remove trailing whitespaces --- .devcontainer/devcontainer.json | 6 +-- .devcontainer/noop.txt | 2 +- .github/workflows/check-pr-title.yml | 2 +- CONTRIBUTING.md | 4 +- SECURITY.md | 2 +- docs/source/authentication.md | 30 +++++++------- docs/source/commandline_tools.md | 28 ++++++------- docs/source/datasets.md | 30 +++++++------- docs/source/downloading.md | 8 ++-- docs/source/examples.md | 60 +++++++++++++-------------- docs/source/first_steps.md | 32 +++++++------- docs/source/hyperdrive.md | 4 +- docs/source/logging.md | 4 +- docs/source/lowpriority.md | 2 +- docs/source/runner.md | 16 +++---- docs/source/self_supervised_models.md | 16 +++---- hi-ml-azure/package_description.md | 2 +- hi-ml-azure/pytest.ini | 2 +- 18 files changed, 125 insertions(+), 125 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 6d739ffc4..4e49e15fb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,7 +2,7 @@ // https://github.com/microsoft/vscode-dev-containers/tree/v0.187.0/containers/python-3-miniconda { "name": "hi-ml", - "build": { + "build": { "context": "..", "dockerfile": "Dockerfile", "args": { @@ -12,7 +12,7 @@ }, // Set *default* container specific settings.json values on container create. - "settings": { + "settings": { "python.pythonPath": "/opt/conda/bin/python", "python.languageServer": "Pylance", "python.linting.enabled": true, @@ -42,7 +42,7 @@ // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. // "remoteUser": "vscode" - + // Extra settings to start the docker container in order to use libfuse, required for locally mounting datasets. 
// More info: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.filedataset?view=azure-ml-py#mount-mount-point-none----kwargs- "runArgs": [ diff --git a/.devcontainer/noop.txt b/.devcontainer/noop.txt index abee19541..a3cb3cca1 100644 --- a/.devcontainer/noop.txt +++ b/.devcontainer/noop.txt @@ -1,3 +1,3 @@ This file is copied into the container along with environment.yml* from the -parent folder. This is done to prevent the Dockerfile COPY instruction from +parent folder. This is done to prevent the Dockerfile COPY instruction from failing if no environment.yml is found. \ No newline at end of file diff --git a/.github/workflows/check-pr-title.yml b/.github/workflows/check-pr-title.yml index 0c5b693fb..a786d5022 100644 --- a/.github/workflows/check-pr-title.yml +++ b/.github/workflows/check-pr-title.yml @@ -1,5 +1,5 @@ name: 'Check PR Title' -on: +on: pull_request: types: [edited, opened, synchronize, reopened] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 40e4b3b5d..6403375d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ class Foo: This is the class description. The following block will be pretty-printed by Sphinx. Note the space between >>> and the code! - + Usage example: >>> from module import Foo >>> foo = Foo(bar=1.23) @@ -107,7 +107,7 @@ class Foo: if enclosed in double backtick. This method can raise a :exc:`ValueError`. - + :param arg: This is a description for the method argument. Long descriptions should be indented. """ diff --git a/SECURITY.md b/SECURITY.md index f7b89984f..1f9d3d442 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -14,7 +14,7 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: diff --git a/docs/source/authentication.md b/docs/source/authentication.md index e012a94ef..4dbda068b 100644 --- a/docs/source/authentication.md +++ b/docs/source/authentication.md @@ -2,14 +2,14 @@ ## Authentication -The `hi-ml` package uses two possible ways of authentication with Azure. +The `hi-ml` package uses two possible ways of authentication with Azure. The default is what is called "Interactive Authentication". When you submit a job to Azure via `hi-ml`, this will use the credentials you used in the browser when last logging into Azure. If there are no credentials yet, you should see instructions printed out to the console about how to log in using your browser. -We recommend using Interactive Authentication. +We recommend using Interactive Authentication. -Alternatively, you can use a so-called Service Principal, for example within build pipelines. +Alternatively, you can use a so-called Service Principal, for example within build pipelines. ## Service Principal Authentication @@ -19,7 +19,7 @@ training runs from code, for example from within an Azure pipeline. 
You can find [here](https://docs.microsoft.com/en-us/azure/active-directory/develop/app-objects-and-service-principals). If you would like to use Service Principal, you will need to create it in Azure first, and then store 3 pieces -of information in 3 environment variables — please see the instructions below. When all the 3 environment variables are in place, +of information in 3 environment variables — please see the instructions below. When all the 3 environment variables are in place, your Azure submissions will automatically use the Service Principal to authenticate. @@ -29,28 +29,28 @@ your Azure submissions will automatically use the Service Principal to authentic 1. Navigate to `App registrations` (use the top search bar to find it). 1. Click on `+ New registration` on the top left of the page. 1. Choose a name for your application e.g. `MyServicePrincipal` and click `Register`. - 1. Once it is created you will see your application in the list appearing under `App registrations`. This step might take - a few minutes. - 1. Click on the resource to access its properties. In particular, you will need the application ID. - You can find this ID in the `Overview` tab (accessible from the list on the left of the page). - 1. Create an environment variable called `HIML_SERVICE_PRINCIPAL_ID`, and set its value to the application ID you + 1. Once it is created you will see your application in the list appearing under `App registrations`. This step might take + a few minutes. + 1. Click on the resource to access its properties. In particular, you will need the application ID. + You can find this ID in the `Overview` tab (accessible from the list on the left of the page). + 1. Create an environment variable called `HIML_SERVICE_PRINCIPAL_ID`, and set its value to the application ID you just saw. - 1. You need to create an application secret to access the resources managed by this service principal. - On the pane on the left find `Certificates & Secrets`. 
Click on `+ New client secret` (bottom of the page), note down your token. - Warning: this token will only appear once at the creation of the token, you will not be able to re-display it again later. + 1. You need to create an application secret to access the resources managed by this service principal. + On the pane on the left find `Certificates & Secrets`. Click on `+ New client secret` (bottom of the page), note down your token. + Warning: this token will only appear once at the creation of the token, you will not be able to re-display it again later. 1. Create an environment variable called `HIML_SERVICE_PRINCIPAL_PASSWORD`, and set its value to the token you just added. ### Providing permissions to the Service Principal -Now that your service principal is created, you need to give permission for it to access and manage your AzureML workspace. +Now that your service principal is created, you need to give permission for it to access and manage your AzureML workspace. To do so: 1. Go to your AzureML workspace. To find it you can type the name of your workspace in the search bar above. 1. On the `Overview` page, there is a link to the Resource Group that contains the workspace. Click on that. 1. When on the Resource Group, navigate to `Access control`. Then click on `+ Add` > `Add role assignment`. A pane will appear on the the right. Select `Role > Contributor`. In the `Select` field type the name of your Service Principal and select it. Finish by clicking `Save` at the bottom of the pane. - - + + ### Azure Tenant ID The last remaining piece is the Azure tenant ID, which also needs to be available in an environment variable. 
To get that ID: diff --git a/docs/source/commandline_tools.md b/docs/source/commandline_tools.md index aa116900f..4658de494 100644 --- a/docs/source/commandline_tools.md +++ b/docs/source/commandline_tools.md @@ -6,8 +6,8 @@ From the command line, run the command ```himl-tb``` -specifying one of -`[--experiment] [--latest_run_file] [--run]` +specifying one of +`[--experiment] [--latest_run_file] [--run]` This will start a TensorBoard session, by default running on port 6006. To use an alternative port, specify this with `--port`. @@ -21,16 +21,16 @@ If you choose to specify `--experiment`, you can also specify `--num_runs` to vi If your AML config path is not ROOT_DIR/config.json, you must also specify `--config_file`. -To see an example of how to create TensorBoard logs using PyTorch on AML, see the +To see an example of how to create TensorBoard logs using PyTorch on AML, see the [AML submitting script](examples/9/aml_sample.rst) which submits the following [pytorch sample script](examples/9/pytorch_sample.rst). Note that to run this, you'll need to create an environment with pytorch and tensorboard as dependencies, as a minimum. See an [example conda environemnt](examples/9/tensorboard_env.rst). This will create an experiment named 'tensorboard_test' on your Workspace, with a single run. Go to outputs + logs -> outputs to see the tensorboard events file. ## Download files from AML Runs -From the command line, run the command +From the command line, run the command ```himl-download``` -specifying one of -`[--experiment] [--latest_run_file] [--run]` +specifying one of +`[--experiment] [--latest_run_file] [--run]` If `--experiment` is provided, the most recent Run from this experiment will be downloaded. If `--latest_run_file` is provided, the script will expect to find a RunId in this file. 
@@ -46,29 +46,29 @@ If your AML config path is not `ROOT_DIR/config.json`, you must also specify `-- ## Creating your own command line tools When creating your own command line tools that interact with the Azure ML ecosystem, you may wish to use the - `AmlRunScriptConfig` class for argument parsing. This gives you a quickstart way for accepting command line arguments to + `AmlRunScriptConfig` class for argument parsing. This gives you a quickstart way for accepting command line arguments to specify the following - + - experiment: a string representing the name of an Experiment, from which to retrieve AML runs - tags: to filter the runs within the given experiment - num_runs: to define the number of most recent runs to return from the experiment - run: to instead define one or more run ids from which to retrieve runs (also supports the older format of run recovery ideas although these are obsolete now) - latest_run_file: to instead provide a path to a file containing the id of your latest run, for retrieval. - config_path: to specify a config.json file in which your workspace settings are defined - -You can extend this list of arguments by creating a child class that inherits from AMLRunScriptConfig. + +You can extend this list of arguments by creating a child class that inherits from AMLRunScriptConfig. ### Defining your own argument types Additional arguments can have any of the following types: `bool`, `integer`, `float`, `string`, `list`, `class/class instance` -with no additional work required. You can also define your own custom type, by providing a custom class in your code that -inherits from `CustomTypeParam`. It must define 2 methods: -1. `_validate(self, x: Any)`: which should raise a `ValueError` if x is not of the type you expect, and should also make a call +with no additional work required. You can also define your own custom type, by providing a custom class in your code that +inherits from `CustomTypeParam`. It must define 2 methods: +1. 
`_validate(self, x: Any)`: which should raise a `ValueError` if x is not of the type you expect, and should also make a call `super()._validate(val)` 2. `from_string(self, y: string)` which takes in the command line arg as a string (`y`) and returns an instance of the type that you want. For example, if your custom type is a tuple, this method should create a tuple from the input string and return that. An example of a custom type can be seen in our own custom type: `RunIdOrListParam`, which accepts a string representing one or more -run ids (or run recovery ids) and returns either a List or a single RunId object (or RunRecoveryId object if appropriate) +run ids (or run recovery ids) and returns either a List or a single RunId object (or RunRecoveryId object if appropriate) ### Example: diff --git a/docs/source/datasets.md b/docs/source/datasets.md index 731986320..5af8da2e8 100644 --- a/docs/source/datasets.md +++ b/docs/source/datasets.md @@ -11,12 +11,12 @@ to one dataset. ### AzureML Data Stores -Secondly, there are data stores. This is a concept coming from Azure Machine Learning, described +Secondly, there are data stores. This is a concept coming from Azure Machine Learning, described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data). Data stores provide access to one blob storage account. They exist so that the credentials to access blob storage do not have to be passed around -in the code - rather, the credentials are stored in the data store once and for all. +in the code - rather, the credentials are stored in the data store once and for all. -You can view all data stores in your AzureML workspace by clicking on one of the bottom icons in the left-hand +You can view all data stores in your AzureML workspace by clicking on one of the bottom icons in the left-hand navigation bar of the AzureML studio. One of these data stores is designated as the default data store. @@ -27,11 +27,11 @@ Thirdly, there are datasets. 
Again, this is a concept coming from Azure Machine * A data store * A set of files accessed through that data store -You can view all datasets in your AzureML workspace by clicking on one of the icons in the left-hand +You can view all datasets in your AzureML workspace by clicking on one of the icons in the left-hand navigation bar of the AzureML studio. ### Preparing data -To simplify usage, the `hi-ml` package creates AzureML datasets for you. All you need to do is to +To simplify usage, the `hi-ml` package creates AzureML datasets for you. All you need to do is to * Create a blob storage account for your data, and within it, a container for your data. * Create a data store that points to that storage account, and store the credentials for the blob storage account in it @@ -54,7 +54,7 @@ What will happen under the hood? is no dataset of that name, it will create one from all the files in blob storage in folder "my_folder". The dataset will be created using the data store provided, "my_datastore". * Once the script runs in AzureML, it will download the dataset "my_folder" to a temporary folder. -* You can access this temporary location by `run_info.input_datasets[0]`, and read the files from it. +* You can access this temporary location by `run_info.input_datasets[0]`, and read the files from it. More complicated setups are described below. @@ -77,11 +77,11 @@ output_folder = run_info.output_datasets[0] ``` Your script can now read files from `input_folder`, transform them, and write them to `output_folder`. The latter will be a folder on the temp file system of the machine. At the end of the script, the contents of that temp folder -will be uploaded to blob storage, and registered as a dataset. +will be uploaded to blob storage, and registered as a dataset. ### Mounting and downloading An input dataset can be downloaded before the start of the actual script run, or it can be mounted. 
When mounted, -the files are accessed via the network once needed - this is very helpful for large datasets where downloads would +the files are accessed via the network once needed - this is very helpful for large datasets where downloads would create a long waiting time before the job start. Similarly, an output dataset can be uploaded at the end of the script, or it can be mounted. Mounting here means that @@ -89,7 +89,7 @@ all files will be written to blob storage already while the script runs (rather Note: If you are using mounted output datasets, you should NOT rename files in the output folder. -Mounting and downloading can be triggered by passing in `DatasetConfig` objects for the `input_datasets` argument, +Mounting and downloading can be triggered by passing in `DatasetConfig` objects for the `input_datasets` argument, like this: ```python @@ -105,14 +105,14 @@ output_folder = run_info.output_datasets[0] ### Local execution For debugging, it is essential to have the ability to run a script on a local machine, outside of AzureML. -Clearly, your script needs to be able to access data in those runs too. +Clearly, your script needs to be able to access data in those runs too. There are two ways of achieving that: Firstly, you can specify an equivalent local folder in the `DatasetConfig` objects: ```python from pathlib import Path from health_azure import DatasetConfig, submit_to_azure_if_needed -input_dataset = DatasetConfig(name="my_folder", +input_dataset = DatasetConfig(name="my_folder", datastore="my_datastore", local_folder=Path("/datasets/my_folder_local")) run_info = submit_to_azure_if_needed(..., @@ -134,8 +134,8 @@ AzureML has the capability to download/mount a dataset to such a fixed location. 
trigger that behaviour via an additional option in the `DatasetConfig` objects: ```python from health_azure import DatasetConfig, submit_to_azure_if_needed -input_dataset = DatasetConfig(name="my_folder", - datastore="my_datastore", +input_dataset = DatasetConfig(name="my_folder", + datastore="my_datastore", use_mounting=True, target_folder="/tmp/mnist") run_info = submit_to_azure_if_needed(..., @@ -147,12 +147,12 @@ input_folder = run_info.input_datasets[0] This is also true when running locally - if `local_folder` is not specified and an AzureML workspace can be found, then the dataset will be downloaded or mounted to the `target_folder`. ### Dataset versions -AzureML datasets can have versions, starting at 1. You can view the different versions of a dataset in the AzureML +AzureML datasets can have versions, starting at 1. You can view the different versions of a dataset in the AzureML workspace. In the `hi-ml` toolbox, you would always use the latest version of a dataset unless specified otherwise. If you do need a specific version, use the `version` argument in the `DatasetConfig` objects: ```python from health_azure import DatasetConfig, submit_to_azure_if_needed -input_dataset = DatasetConfig(name="my_folder", +input_dataset = DatasetConfig(name="my_folder", datastore="my_datastore", version=7) run_info = submit_to_azure_if_needed(..., diff --git a/docs/source/downloading.md b/docs/source/downloading.md index 68cb6dcc0..c80a21c2c 100644 --- a/docs/source/downloading.md +++ b/docs/source/downloading.md @@ -21,7 +21,7 @@ end up with the files ["my_outputs/abc/def/1.txt", "my_outputs/abc/2.txt"] If you wish to specify the file name(s) to be downloaded, you can do so with the "prefix" parameter. E.g. prefix="outputs" will download all files within the "output" folder, if such a folder exists within your Run. -There is an additional parameter, "validate_checksum" which defaults to False. 
If True, will validate +There is an additional parameter, "validate_checksum" which defaults to False. If True, will validate MD5 hash of the data arriving (in chunks) to that being sent. Note that if your code is running in a distributed manner, files will only be downloaded onto nodes with local rank = 0. @@ -43,7 +43,7 @@ All files within the checkpoint directory will be downloaded into the folder spe Since checkpoint files are often large and therefore prone to corruption during download, by default, this function will validate the MD5 hash of the data downloaded (in chunks) compared to that being sent. Note that if your code is running in a distributed manner, files will only be downloaded onto nodes with local rank = 0. -E.g. if you have 2 nodes each running 4 processes, the file will be downloaded by CPU/GPU 0 on each of the 2 nodes. +E.g. if you have 2 nodes each running 4 processes, the file will be downloaded by CPU/GPU 0 on each of the 2 nodes. All processes will be synchronized to only exit the downloading method once it has completed on all nodes/ranks. @@ -67,7 +67,7 @@ function with file_prefix="foo/bar" and output_folder="outputs", you would end u files ["outputs/foo/bar/1.txt", "outputs/foo/bar/2.txt"] This function takes additional parameters "overwrite" and "show_progress". If True, overwrite will overwrite any existing local files with the same path. If False and there is a duplicate file, it will skip this file. -If show_progress is set to True, the progress of the file download will be visible in the terminal. +If show_progress is set to True, the progress of the file download will be visible in the terminal. ## Uploading files to an Azure ML Datastore @@ -84,4 +84,4 @@ Note that the path to local data must be a folder, not a single path. The folder following paths uploaded to your Datastore: ["baz/1.txt", "baz/2.txt"] This function takes additional parameters "overwrite" and "show_progress". 
If True, overwrite will overwrite any existing remote files with the same path. If False and there is a duplicate file, it will skip this file. -If show_progress is set to True, the progress of the file upload will be visible in the terminal. \ No newline at end of file +If show_progress is set to True, the progress of the file upload will be visible in the terminal. \ No newline at end of file diff --git a/docs/source/examples.md b/docs/source/examples.md index 14806f74b..04dccca1a 100644 --- a/docs/source/examples.md +++ b/docs/source/examples.md @@ -1,8 +1,8 @@ # Examples -**Note**: All examples below contain links to sample scripts that are also included in the repository. +**Note**: All examples below contain links to sample scripts that are also included in the repository. The experience is **optimized for use on readthedocs**. When navigating to the sample scripts on the github UI, -you will only see the `.rst` file that links to the `.py` file. To access the `.py` file, go to the folder that +you will only see the `.rst` file that links to the `.py` file. To access the `.py` file, go to the folder that contains the respective `.rst` file. ## Basic integration @@ -113,14 +113,14 @@ For more details about datasets, see [here](datasets.md) This example trains a simple classifier on a toy dataset, first creating the dataset files and then in a second script training the classifier. -The script [examples/5/inputs.py](examples/5/inputs.rst) is provided to prepare the csv files. Run the script to +The script [examples/5/inputs.py](examples/5/inputs.rst) is provided to prepare the csv files. 
Run the script to download the Iris dataset and create two CSV files: ```bash cd examples/5 python inputs.py ``` -The training script [examples/5/sample.py](examples/5/sample.rst) is modified from +The training script [examples/5/sample.py](examples/5/sample.rst) is modified from [https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train_iris.py](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train_iris.py) to work with input csv files. Start it to train the actual classifier, based on the data files that were just written: ```bash @@ -139,11 +139,11 @@ However, it is not ideal to have the input files in the snapshot: The size of th It is better to put the data files into blob storage and use input datasets. ### Creating the dataset in AzureML -The suggested way of creating a dataset is to run a script in AzureML that writes an output dataset. This is +The suggested way of creating a dataset is to run a script in AzureML that writes an output dataset. This is particularly important for large datasets, to avoid the usually low bandwith from a local machine to the cloud. -This is shown in [examples/6/inputs.py](examples/6/inputs.rst): -This script prepares the CSV files in an AzureML run, and writes them to an output dataset called `himl_sample6_input`. +This is shown in [examples/6/inputs.py](examples/6/inputs.rst): +This script prepares the CSV files in an AzureML run, and writes them to an output dataset called `himl_sample6_input`. 
The relevant code parts are: ```python run_info = submit_to_azure_if_needed( @@ -159,7 +159,7 @@ cd examples/6 python inputs.py --azureml ``` -You can now modify the training script [examples/6/sample.py](examples/6/sample.rst) to use the newly created dataset +You can now modify the training script [examples/6/sample.py](examples/6/sample.rst) to use the newly created dataset `himl_sample6_input` as an input. To do that, the following parameters are added to `submit_to_azure_if_needed`: ```python run_info = submit_to_azure_if_needed( @@ -174,10 +174,10 @@ input_folder = run_info.input_datasets[0] or Path("dataset") ``` The part behind the `or` statement is only necessary to keep a reasonable behaviour when running outside of AzureML: When running in AzureML `run_info.input_datasets[0]` will be populated using input dataset specified in the call to -`submit_to_azure_if_needed`, and the input will be downloaded from blob storage. When running locally +`submit_to_azure_if_needed`, and the input will be downloaded from blob storage. When running locally `run_info.input_datasets[0]` will be `None` and a local folder should be populated and used. -The `default_datastore` is required if using the simplest configuration for an input dataset. There are +The `default_datastore` is required if using the simplest configuration for an input dataset. There are alternatives that do not require the `default_datastore` and allows a different datastore for each dataset, for example: ```python @@ -194,15 +194,15 @@ For more details about datasets, see [here](datasets.md) An alternative to writing the dataset in AzureML (as suggested above) is to create them on the local machine, and upload them manually directly to Azure blob storage. 
-This is shown in [examples/7/inputs.py](examples/7/inputs.rst): This script prepares the CSV files +This is shown in [examples/7/inputs.py](examples/7/inputs.rst): This script prepares the CSV files and uploads them to blob storage, in a folder called `himl_sample7_input`. Run the script: ```bash cd examples/7 python inputs_via_upload.py ``` -As in the above example, you can now modify the training script [examples/7/sample.py](examples/7/sample.rst) to use -an input dataset that has the same name as the folder where the files just got uploaded. In this case, the following +As in the above example, you can now modify the training script [examples/7/sample.py](examples/7/sample.rst) to use +an input dataset that has the same name as the folder where the files just got uploaded. In this case, the following parameters are added to `submit_to_azure_if_needed`: ```python @@ -214,8 +214,8 @@ parameters are added to `submit_to_azure_if_needed`: ## Hyperdrive -The sample [examples/8/sample.py](examples/8/sample.rst) demonstrates adding hyperparameter tuning. This shows the -same hyperparameter search as in the +The sample [examples/8/sample.py](examples/8/sample.rst) demonstrates adding hyperparameter tuning. This shows the +same hyperparameter search as in the [AzureML sample](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb). Make the following additions: @@ -251,7 +251,7 @@ cd examples/8 python sample.py --azureml ``` -this will perform a Hyperdrive run in AzureML, i.e. there will be 12 child runs, each randomly drawing from the +this will perform a Hyperdrive run in AzureML, i.e. there will be 12 child runs, each randomly drawing from the parameter sample space. AzureML can plot the metrics from the child runs, but to do that, some small modifications are required. 
Add in: @@ -270,10 +270,10 @@ and these metrics will be displayed on the child runs tab in the Experiment page ## Controlling when to submit to AzureML and when not -By default, the `hi-ml` package assumes that you supply a commandline argument `--azureml` (that can be anywhere on +By default, the `hi-ml` package assumes that you supply a commandline argument `--azureml` (that can be anywhere on the commandline) to trigger a submission of the present script to AzureML. If you wish to control it via a different flag, coming out of your own argument parser, use the `submit_to_azureml` argument of the function -`health.azure.himl.submit_to_azure_if_needed`. +`health.azure.himl.submit_to_azure_if_needed`. ## Training with k-fold cross validation in Azure ML @@ -281,7 +281,7 @@ It is possible to create a parent run on Azure ML that is associated with one or https://docs.microsoft.com/en-us/azure/machine-learning/how-to-track-monitor-analyze-runs?tabs=python#create-child-runs) for further information.) This is useful in circumstances such as k-fold cross-validation, where individual child run perform validation on a different data split. When a [HyperDriveRun]( -https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.hyperdriverun?view=azure-ml-py) +https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.hyperdriverun?view=azure-ml-py) is created in Azure ML, it follows this same principle and generates multiple child runs, associated with one parent. To train with k-fold cross validation using `submit_to_azure_if_needed`, you must do two things. @@ -289,10 +289,10 @@ To train with k-fold cross validation using `submit_to_azure_if_needed`, you mus 1. Call the helper function `create_crossval_hyperdrive_config` to create an AML HyperDriveConfig object representing your parent run. 
It will have one child run for each of the k-fold splits you request, as follows - + ```python from health_azure import create_crossval_hyperdrive_config - + hyperdrive_config = create_crossval_hyperdrive_config(num_splits, cross_val_index_arg_name=cross_val_index_arg_name, metric_name=metric_name) @@ -301,7 +301,7 @@ splits you request, as follows - `num_splits` is the number of k-fold cross validation splits you require - `cross_val_index_arg_name` is the name of the argument given to each child run, whose value denotes which split that child represents (this parameter defaults to 'cross_validation_split_index', in which case, supposing you - specified 2 cross validation splits, one would receive the arguments ['--cross_validation_split_index' '0'] + specified 2 cross validation splits, one would receive the arguments ['--cross_validation_split_index' '0'] and the other would receive ['--cross_validation_split_index' '1']]. It is up to you to then use these args to retrieve the correct split from your data. - `metrics_name` represents the name of a metric that you will compare your child runs by. **NOTE** the @@ -311,7 +311,7 @@ splits you request, as follows You can log this metric in your training script as follows: ```python from azureml.core import Run - + # Example of logging a metric called to an AML Run. loss = run_log = Run.get_context() @@ -321,7 +321,7 @@ splits you request, as follows further explanation. 2. The hyperdrive_config returned above must be passed into the function `submit_to_azure_if_needed` as follows: - + ```python run_info = submit_to_azure_if_needed( ... 
@@ -332,7 +332,7 @@ splits you request, as follows ## Retrieving the aggregated results of a cross validation/ HyperDrive run -You can retrieve a Pandas DataFrame of the aggregated results from your cross validation run as follows: +You can retrieve a Pandas DataFrame of the aggregated results from your cross validation run as follows: ```python from health_azure import aggregate_hyperdrive_metrics @@ -340,7 +340,7 @@ from health_azure import aggregate_hyperdrive_metrics df = aggregate_hyperdrive_metrics(run_id, child_run_arg_name) ``` where: - - `run_id` is a string representing the id of your HyperDriveRun. Note that this **must** be an instance of an + - `run_id` is a string representing the id of your HyperDriveRun. Note that this **must** be an instance of an AML HyperDriveRun. - `child_run_arg_name` is a string representing the name of the argument given to each child run to denote its position relative to other child runs (e.g. this arg could equal 'child_run_index' - then each of your child runs should expect @@ -348,15 +348,15 @@ AML HyperDriveRun. If your HyperDrive run has 2 children, each logging the metrics epoch, accuracy and loss, the result would look like this: - + | | 0 | 1 | |--------------|-----------------|--------------------| | epoch | [1, 2, 3] | [1, 2, 3] | | accuracy | [0.7, 0.8, 0.9] | [0.71, 0.82, 0.91] | | loss | [0.5, 0.4, 0.3] | [0.45, 0.37, 0.29] | - - here each column is one of the splits/ child runs, and each row is one of the metrics you have logged to the run. - + + here each column is one of the splits/ child runs, and each row is one of the metrics you have logged to the run. + It is possible to log rows and tables in Azure ML by calling run.log_table and run.log_row respectively. In this case, the DataFrame will contain a Dictionary entry instead of a list, where the keys are the table columns (or keywords provided to log_row), and the values are the table values. e.g. 
diff --git a/docs/source/first_steps.md b/docs/source/first_steps.md index ddaaea85f..44b165305 100644 --- a/docs/source/first_steps.md +++ b/docs/source/first_steps.md @@ -6,7 +6,7 @@ resource, you can consume vast datasets, and access multiple machines at the sam ## Setting up AzureML You need to have an AzureML workspace in your Azure subscription. -Download the config file from your AzureML workspace, as described +Download the config file from your AzureML workspace, as described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-environment). **Put this file (it should be called `config.json`) into the folder where your script lives**, or one of its parent folders. You can use parent folders up to the last parent that is still included in the `PYTHONPATH` environment variable: `hi-ml` will @@ -14,7 +14,7 @@ try to be smart and search through all folders that it thinks belong to your cur ## Using the AzureML integration layer -Consider a simple use case, where you have a Python script that does something - this could be training a model, +Consider a simple use case, where you have a Python script that does something - this could be training a model, or pre-processing some data. The `hi-ml` package can help easily run that on Azure Machine Learning (AML) services. Here is an example script that reads images from a folder, resizes and saves them to an output folder: @@ -68,14 +68,14 @@ Note that you do not need to modify the argument parser of your script to recogn ## Essential arguments to `submit_to_azure_if_needed` When calling `submit_to_azure_if_needed`, you can supply the following parameters: -* `compute_cluster_name` (**Mandatory**): The name of the AzureML cluster that should run the job. This can be a +* `compute_cluster_name` (**Mandatory**): The name of the AzureML cluster that should run the job. This can be a cluster with CPU or GPU machines.
See [here for documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-studio#amlcompute) * `entry_script`: The script that should be run. If omitted, the `hi-ml` package will assume that you would like to submit the script that is presently running, given in `sys.argv[0]`. * `snapshot_root_directory`: The directory that contains all code that should be packaged and sent to AzureML. All Python code that the script uses must be copied over. This defaults to the current working directory, but can be -one of its parents. If you would like to explicitly skip some folders inside the `snapshot_root_directory`, then use +one of its parents. If you would like to explicitly skip some folders inside the `snapshot_root_directory`, then use `ignored_folders` to specify those. * `conda_environment_file`: The conda configuration file that describes which packages are necessary for your script to run. If omitted, the `hi-ml` package searches for a file called `environment.yml` in the current folder or its @@ -83,12 +83,12 @@ parents. You can also supply an input dataset. For data pre-processing scripts, you can add an output dataset (omit this for ML training scripts). -* To use datasets, you need to provision a data store in your AML workspace, that points to your training data in - blob storage. This is described +* To use datasets, you need to provision a data store in your AML workspace, that points to your training data in + blob storage. This is described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-connect-data-ui). * `input_datasets=["images123"]` in the code above means that the script will consume all data in folder `images123` in blob storage as the input. The folder must exist in blob storage, in the location that you gave when creating the -datastore. Once the script has run, it will also register the data in this folder as an AML dataset. +datastore. 
Once the script has run, it will also register the data in this folder as an AML dataset. * `output_datasets=["images123_resized"]` means that the script will create a temporary folder when running in AML, and while the job writes data to that folder, upload it to blob storage, in the data store. @@ -97,7 +97,7 @@ For more examples, please see [examples.md](examples.md). For more details about ## Additional arguments you should know about -`submit_to_azure_if_needed` has a large number of arguments, please check the +`submit_to_azure_if_needed` has a large number of arguments, please check the [API documentation](api/health.azure.submit_to_azure_if_needed.rst) for an exhaustive list. The particularly helpful ones are listed below. @@ -106,7 +106,7 @@ The particularly helpful ones are listed below. * `environment_variables`: A dictionary with the contents of all environment variables that should be set inside the AzureML run, before the script is started. * `docker_base_image`: This specifies the name of the Docker base image to use for creating the - Python environment for your script. The amount of memory to allocate for Docker is given by `docker_shm_size`. + Python environment for your script. The amount of memory to allocate for Docker is given by `docker_shm_size`. * `num_nodes`: The number of nodes on which your script should run. This is essential for distributed training. * `tags`: A dictionary mapping from string to string, with additional tags that will be stored on the AzureML run. This is helpful to add metadata about the run for later use. @@ -114,8 +114,8 @@ The particularly helpful ones are listed below. ## Conda environments, Alternate pips, Private wheels -The function `submit_to_azure_if_needed` tries to locate a Conda environment file in the current folder, -or in the Python path, with the name `environment.yml`. 
The actual Conda environment file to use can be specified +The function `submit_to_azure_if_needed` tries to locate a Conda environment file in the current folder, +or in the Python path, with the name `environment.yml`. The actual Conda environment file to use can be specified directly with: ```python @@ -126,10 +126,10 @@ directly with: where `conda_environment_file` is a `pathlib.Path` or a string identifying the Conda environment file to use. -The basic use of Conda assumes that packages listed are published -[Conda packages](https://docs.conda.io/projects/conda/en/latest/user-guide/concepts/packages.html) or published -Python packages on [PyPI](https://pypi.org/). However, during development, the Python package may be on -[Test.PyPI](https://test.pypi.org/), or in some other location, in which case the alternative package location can +The basic use of Conda assumes that packages listed are published +[Conda packages](https://docs.conda.io/projects/conda/en/latest/user-guide/concepts/packages.html) or published +Python packages on [PyPI](https://pypi.org/). However, during development, the Python package may be on +[Test.PyPI](https://test.pypi.org/), or in some other location, in which case the alternative package location can be specified directly with: ```python @@ -146,5 +146,5 @@ Finally, it is possible to use a private wheel, if the package is only available private_pip_wheel_path=private_pip_wheel_path, ``` -where `private_pip_wheel_path` is a `pathlib.Path` or a string identifying the wheel package to use. In this case, +where `private_pip_wheel_path` is a `pathlib.Path` or a string identifying the wheel package to use. In this case, this wheel will be copied to the AzureML environment as a private wheel. diff --git a/docs/source/hyperdrive.md b/docs/source/hyperdrive.md index 8399093ad..6d1e2e57c 100644 --- a/docs/source/hyperdrive.md +++ b/docs/source/hyperdrive.md @@ -7,7 +7,7 @@ object as an additional argument. 
Note that this object needs to be created with will later be replaced with the correct `run_config` that submits your script.) The example below shows a hyperparameter search that aims to minimize the validation loss `val_loss`, by choosing -one of three possible values for the learning rate commandline argument `learning_rate`. +one of three possible values for the learning rate commandline argument `learning_rate`. ```python from azureml.core import ScriptRunConfig from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice @@ -25,5 +25,5 @@ hyperdrive_config = HyperDriveConfig( submit_to_azure_if_needed(..., hyperdrive_config=hyperdrive_config) ``` -For further examples, please check the [example scripts here](examples.md), and the +For further examples, please check the [example scripts here](examples.md), and the [HyperDrive documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters). diff --git a/docs/source/logging.md b/docs/source/logging.md index d83ef7244..4e6ac9ea4 100644 --- a/docs/source/logging.md +++ b/docs/source/logging.md @@ -56,7 +56,7 @@ All results that you achieve in such runs outside AzureML can be written straigh * When instantiated outside an AzureML run, it will create a new `Run` object that writes its metrics straight through to AzureML, even though the code is not running in AzureML. -This behaviour is controlled by the `enable_logging_outside_azure_ml` argument. With the following code snippet, +This behaviour is controlled by the `enable_logging_outside_azure_ml` argument. With the following code snippet, you can use the `AzureMLLogger` to write metrics to AzureML when the code is inside or outside AzureML: ```python @@ -156,4 +156,4 @@ class MyModule(LightningModule): `log_learning_rate` will log values from all learning rate schedulers, and all learning rates if a scheduler returns multiple values.
In this example, the logged metric will be `learning_rate` if there is a single scheduler that outputs -a single LR, or `learning_rate/1/0` to indicate the value coming from scheduler index 1, value index 0. +a single LR, or `learning_rate/1/0` to indicate the value coming from scheduler index 1, value index 0. diff --git a/docs/source/lowpriority.md b/docs/source/lowpriority.md index 9e1b8409a..dc133ee93 100644 --- a/docs/source/lowpriority.md +++ b/docs/source/lowpriority.md @@ -100,7 +100,7 @@ def get_latest_recovery_checkpoint(): all_recovery_files = [f for f in Path(CHECKPOINT_FOLDER).glob(RECOVERY_CHECKPOINT_FILE_NAME + "*")] if len(all_recovery_files) == 0: return None - # Get recovery checkpoint with highest epoch number + # Get recovery checkpoint with highest epoch number recovery_epochs = [int(re.findall(r"[\d]+", f.stem)[0]) for f in all_recovery_files] idx_max_epoch = int(np.argmax(recovery_epochs)) return str(all_recovery_files[idx_max_epoch]) diff --git a/docs/source/runner.md b/docs/source/runner.md index 4a97b52f1..2bbf8070c 100644 --- a/docs/source/runner.md +++ b/docs/source/runner.md @@ -18,24 +18,24 @@ from the command line by `himl-runner --model=HelloWorld`. ## Specifying the model to run The `--model` argument specifies the name of a class that should be used for model training. The class needs to -be a subclass of `LightningContainer`, see below. There are different ways of telling the runner where to find +be a subclass of `LightningContainer`, see below. There are different ways of telling the runner where to find that class: -* If just providing a single class name, like `--model=HelloWorld`, the class is expected somewhere in the +* If just providing a single class name, like `--model=HelloWorld`, the class is expected somewhere in the `health_ml.configs` namespace. It can be in any module/folder inside of that namespace. 
* If the class is outside of the `health_ml.configs` (as would be normal if using the `himl-runner` from a package), you need to provide some "hints" where to start searching. It is enough to provide the start of the namespace string: -for example, `--model histopathology.PandaImageNetMIL` is effectively telling the runner to search for the -`PandaImageNetMIL` class _anywhere_ in the `histopathology` namespace. You can think of this as +for example, `--model histopathology.PandaImageNetMIL` is effectively telling the runner to search for the +`PandaImageNetMIL` class _anywhere_ in the `histopathology` namespace. You can think of this as `histopathology.*.PandaImageNetMIL` -## Running ML experiments in Azure ML +## Running ML experiments in Azure ML To train in AzureML, add a `--azureml` flag. Use the flag `--cluster` to specify the name of the cluster in your Workspace that you want to submit the job to. So the whole command would look like: `himl-runner --model=HelloContainer --cluster=my_cluster_name --azureml`. You can also specify `--num_nodes` if you wish to distribute the model training. -When starting the runner, you need to do that from a directory that contains all the code that your experiment needs: +When starting the runner, you need to do that from a directory that contains all the code that your experiment needs: The current working directory will be used as the root of all data that will be copied to AzureML to run your experiment. (the only exception to this rule is if you start the runner from within an enlistment of the HI-ML GitHub repository). @@ -155,7 +155,7 @@ class MyContainer(LightningContainer): return MyDataModule(root_path=self.local_dataset) ``` -By default, config files will be looked for in the folder "health_ml.configs". To specify config files +By default, config files will be looked for in the folder "health_ml.configs". To specify config files that live elsewhere, use a fully qualified name for the parameter `--model` - e.g. 
"MyModule.Configs.my_config.py" @@ -192,6 +192,6 @@ class MyContainer(LightningContainer): ``` ### Optimizer and LR scheduler arguments To the optimizer and LR scheduler: the Lightning model returned by `create_model` should define its own -`configure_optimizers` method, with the same signature as `LightningModule.configure_optimizers`, +`configure_optimizers` method, with the same signature as `LightningModule.configure_optimizers`, and returns a tuple containing the Optimizer and LRScheduler objects diff --git a/docs/source/self_supervised_models.md b/docs/source/self_supervised_models.md index dfbeba286..8f5f14d67 100644 --- a/docs/source/self_supervised_models.md +++ b/docs/source/self_supervised_models.md @@ -107,10 +107,10 @@ with the following available arguments: * `ssl_encoder`: name of the encoder to train, member of `EncoderName` class, currently supported are resnet50, resnet101 and densenet121, * `ssl_training_type`: which SSL algorithm to use, member of `SSLType` choice between BYOL and SimCLR, -* `ssl_training_batch_size`: batch size of SSL training. This is the number of examples processed by a single GPU. - Multiply this by the number of GPUs to get the effective batch size. -* `linear_head_batch_size`: batch size for linear head training (used for monitor of SSL embeddings quality). This is - the number of examples processed by a single GPU. Multiply this by the number of GPUs to get the effective batch size. +* `ssl_training_batch_size`: batch size of SSL training. This is the number of examples processed by a single GPU. + Multiply this by the number of GPUs to get the effective batch size. +* `linear_head_batch_size`: batch size for linear head training (used for monitor of SSL embeddings quality). This is + the number of examples processed by a single GPU. Multiply this by the number of GPUs to get the effective batch size. * `ssl_augmentation_config`: path to yaml config for augmentation to use during SSL training. 
Only used for NIH/Kaggle datasets. * `linear_head_augmentation_config`: path to yaml config for augmentation to use for linear head training. Only used for @@ -133,12 +133,12 @@ To use this code with your own data, you will need to: and `InnerEyeDataClassBaseWithReturnIndex`. See for example how we constructed `RSNAKaggleCXR` class. WARNING: the first positional argument of your dataset class constructor MUST be the data directory ("root"), as VisionDataModule expects this in the prepare_data step. -3. In your own container update the `_SSLDataClassMappings` member of the class so that the code knows which data class +3. In your own container update the `_SSLDataClassMappings` member of the class so that the code knows which data class to associate to your new dataset name. -4. Create a yaml configuration file that contains the augmentations specific to your dataset. The yaml file will be - consumed by the `create_transforms_from_config` function defined in the +4. Create a yaml configuration file that contains the augmentations specific to your dataset. The yaml file will be + consumed by the `create_transforms_from_config` function defined in the `InnerEye.ML.augmentations.transform_pipeline` module (see next paragraph for more details). Alternatively, overwrite - the `_get_transforms` method. To simplify this step, we have defined a series of standard operations in + the `_get_transforms` method. To simplify this step, we have defined a series of standard operations in `SSL/transforms_utils.py` . You could for example construct a transform pipeline similar to the one created inside `create_transform_from_config` inside your own method. 5. Update all necessary parameters in the model config (cf. 
previous paragraph) diff --git a/hi-ml-azure/package_description.md b/hi-ml-azure/package_description.md index 38dab3ce3..725d950ed 100644 --- a/hi-ml-azure/package_description.md +++ b/hi-ml-azure/package_description.md @@ -25,7 +25,7 @@ Examples that illustrate the use of the `hi-ml` toolbox can be found on ## Changelog -We are relying on Github's auto-generated changelog to describe what went into a release. Please check each individual release +We are relying on Github's auto-generated changelog to describe what went into a release. Please check each individual release ([available here](https://github.com/microsoft/hi-ml/releases)) to see a full changelog. ## Links diff --git a/hi-ml-azure/pytest.ini b/hi-ml-azure/pytest.ini index a82601d75..fe0ffe5b7 100644 --- a/hi-ml-azure/pytest.ini +++ b/hi-ml-azure/pytest.ini @@ -1,6 +1,6 @@ [pytest] testpaths = testazure -norecursedirs = outputs +norecursedirs = outputs log_cli = True log_cli_level = DEBUG addopts = --strict-markers