diff --git a/.ci/azure-pipelines.yml b/.ci/azure-pipelines.yml deleted file mode 100644 index 3c83dadcc..000000000 --- a/.ci/azure-pipelines.yml +++ /dev/null @@ -1,46 +0,0 @@ - -# Pull request against these branches will trigger this build -pr: -- master -- staging - -#Any commit to this branch will trigger the build. -trigger: -- staging -- master - -pool: - vmImage: 'ubuntu-16.04' - -steps: - -- bash: | - echo "##vso[task.prependpath]/usr/share/miniconda/bin" - displayName: Add Conda to PATH - -- bash: | - conda remove -q -n nlp --all -y - python tools/generate_conda_file.py --gpu - conda env create -n nlp_gpu -f nlp_gpu.yaml - conda env list - source activate nlp_gpu - displayName: 'Creating Conda Environment with dependencies' - -- bash: | - source activate nlp_gpu - python -m ipykernel install --user --name nlp_gpu --display-name "nlp_gpu" - # Commenting out pytest since it contains bunch of tests from other project which are not applicable. - # But keeping the line here to show how to run it once tests relevant to this project are added - # pytest --junitxml=junit/test-unitttest.xml #not running any tests for now - displayName: 'Run Unit tests' - -- task: PublishTestResults@2 - inputs: - testResultsFiles: '**/test-unitttest.xml' - testRunTitle: 'Test results for PyTest' - -- task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' diff --git a/.flake8 b/.flake8 index 562be066d..4d86469bd 100644 --- a/.flake8 +++ b/.flake8 @@ -13,4 +13,4 @@ # F821 undefined name 'get_ipython' --> from generated python files using nbconvert ignore = E203, E266, W503, F403, F405, E402, E731, F821 -max-line-length = 79 +max-line-length = 100 diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 000000000..11dc6f43c --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,224 @@ +NOTICES AND INFORMATION +Do Not Translate or Localize + +This software incorporates material from third parties. 
Microsoft makes certain +open source code available at https://3rdpartysource.microsoft.com, or you may +send a check or money order for US $5.00, including the product name, the open +source component name, and version number, to: + +Source Code Compliance Team +Microsoft Corporation +One Microsoft Way +Redmond, WA 98052 +USA + +Notwithstanding any other terms, you may reverse engineer this software to the +extent required to debug changes to any libraries licensed under the GNU Lesser +General Public License. + +------------Attribution Starts Here---------------------------------------------- +Component: https://github.com/huggingface/pytorch-pretrained-BERT + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------End of Attribution --------------------------------------------------- \ No newline at end of file diff --git a/README.md b/README.md index 7c736f652..9c89107f8 100755 --- a/README.md +++ b/README.md @@ -3,27 +3,9 @@ | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | master | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-master?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=22&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/unit-test-staging?branchName=staging)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=21&branchName=staging) | - # NLP Best Practices -This repository contains examples and best practices for building NLP systems, provided as Jupyter notebooks and utility functions. 
The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language. - -The following section includes a list of the available scenarios. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities. - - -## Scenarios - - -| Scenario | Applications | Languages | Models | -|---| ------------------------ | -------------------------------------------- | ------------------- | -|[Text Classification](scenarios/text_classification) |Topic Classification|en, zh, ar|BERT| -|[Named Entity Recognition](scenarios/named_entity_recognition) |Wikipedia NER | en, zh |BERT| -|[Sentence Similarity](scenarios/sentence_similarity) |STS Benchmark |en|Representation: TF-IDF, Word Embeddings, Doc Embeddings
Metrics: Cosine Similarity, Word Mover's Distance| -|[Embeddings](scenarios/embeddings)| Custom Embeddings Training|en|Word2Vec
fastText
GloVe| - - -## Planning -All feature planning is done via projects, milestones, and issues in this repository. +This repository contains examples and best practices for building NLP systems, provided as [Jupyter notebooks](scenarios) and [utility functions](utils_nlp). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on problems involving text and language. ## Getting Started To get started, navigate to the [Setup Guide](SETUP.md), where you'll find instructions on how to setup your environment and dependencies. diff --git a/scenarios/README.md b/scenarios/README.md new file mode 100644 index 000000000..21b222acd --- /dev/null +++ b/scenarios/README.md @@ -0,0 +1,37 @@ +# NLP Scenarios + +This folder contains examples and best practices, written in Jupyter notebooks, for building Natural Language Processing systems for different scenarios. + +## Summary + +The following is a summary of the scenarios covered in the best practice notebooks. Each scenario is demonstrated in one or more Jupyter notebook examples that make use of the core code base of models and utilities. + +| Scenario | Applications | Models | +|---| ------------------------ | ------------------- | +|[Text Classification](scenarios/text_classification) |Topic Classification|BERT| +|[Named Entity Recognition](scenarios/named_entity_recognition) |Wikipedia NER |BERT| +|[Question Answering](scenarios/question_answering) |SQuAD | BiDAF| +|[Sentence Similarity](scenarios/sentence_similarity) |STS Benchmark |Representation: TF-IDF, Word Embeddings, Doc Embeddings
Metrics: Cosine Similarity, Word Mover's Distance| +|[Embeddings](scenarios/embeddings)| Custom Embeddings Training|Word2Vec
fastText
GloVe| + +## Azure-enhanced notebooks + +Azure products and services are used in certain notebooks to enhance the efficiency of developing Natural Language systems at scale. + +To successfully run these notebooks, the users **need an Azure subscription** or can [use Azure for free](https://azure.microsoft.com/en-us/free/). + +The Azure products featured in the notebooks include: + +* [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/) - Azure Machine Learning service is a cloud service used to train, deploy, automate, and manage machine learning models, all at the broad scale that the cloud provides. It is used across various notebooks for the AI model development related tasks like: + * Using Datastores + * Tracking and monitoring metrics to enhance the model creation process + * Distributed Training + * Hyperparameter tuning + * Scaling up and out on Azure Machine Learning Compute + * Deploying a web service to both Azure Container Instance and Azure Kubernetes Service + +* [Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aks) - You can use Azure Machine Learning service to host your classification model in a web service deployment on Azure Kubernetes Service (AKS). AKS is good for high-scale production deployments and provides autoscaling, and fast response times. + +* [Azure Container Instance](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where#aci)- You can use Azure Machine Learning service to host your classification model in a web service deployment on Azure Container Instance (ACI). ACI is good for low scale, CPU-based workloads. + +There may be other Azure service or products used in the notebooks. Introduction and/or reference of those will be provided in the notebooks. 
diff --git a/scenarios/data_prep/README.md b/scenarios/data_prep/README.md index eaf84ad28..5e13abec5 100644 --- a/scenarios/data_prep/README.md +++ b/scenarios/data_prep/README.md @@ -25,7 +25,7 @@ STS Benchmark - sts_load.ipynb + stsbenchmark.ipynb Downloads and cleans the STS Benchmark dataset. Shows an example of tokenizing and removing stopwords using the popular spaCy library. @@ -34,7 +34,7 @@ MSR Paraphrase Corpus - msrpc_load.ipynb + msrpc.ipynb Download and clean the MSR Paraphrase corpus. diff --git a/scenarios/data_prep/stsbenchmark.ipynb b/scenarios/data_prep/stsbenchmark.ipynb index ddd649814..e76967a79 100644 --- a/scenarios/data_prep/stsbenchmark.ipynb +++ b/scenarios/data_prep/stsbenchmark.ipynb @@ -46,7 +46,7 @@ "source": [ "import sys\n", "\n", - "sys.path.append(\"../../../\") ## set the environment path\n", + "sys.path.append(\"../../\") ## set the environment path\n", "\n", "import os\n", "import azureml.dataprep as dp\n", @@ -67,7 +67,7 @@ "outputs": [], "source": [ "STS_URL = \"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\"\n", - "BASE_DATA_PATH = \"../../../data\"\n", + "BASE_DATA_PATH = \"../../data\"\n", "RAW_DATA_PATH = os.path.join(BASE_DATA_PATH, \"raw\")\n", "CLEAN_DATA_PATH = os.path.join(BASE_DATA_PATH, \"clean\")" ] @@ -76,14 +76,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 01 Data Download" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make a directory for the data if it doesn't already exist, and then download." 
+ "### 01 Data Download\n", + "In this section we \n", + "* load raw data into a dataframe\n", + "* peek into the first 5 rows" ] }, { @@ -100,68 +96,21 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [ - "def download_sts(url, dirpath):\n", - " zipfile = maybe_download(url, work_directory=dirpath)\n", - " unzipped = stsbenchmark._extract_sts(zipfile, target_dirpath=dirpath, tmode=\"r:gz\")\n", - " return zipfile, unzipped" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "418kB [00:03, 138kB/s] " + "100%|██████████| 401/401 [00:01<00:00, 310KB/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Data downloaded to ../../../data/raw/stsbenchmark\n" + "Data downloaded to ../../data/raw/raw/stsbenchmark\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "tarfile, datapath = download_sts(STS_URL, RAW_DATA_PATH)\n", - "print(\"Data downloaded to {}\".format(datapath))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 02 Data Understanding\n", - "In this section we \n", - "* load raw data into a dataframe\n", - "* peek into the first 10 rows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can load the data using a `read` function that has built-in automatic filetype inference:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ { "data": { "text/html": [ @@ -183,13 +132,13 @@ " \n", " \n", " \n", - " Column1\n", - " Column2\n", - " Column3\n", - " Column4\n", - " Column5\n", - " Column6\n", - " Column7\n", + " column_0\n", + " column_1\n", + " column_2\n", + " column_3\n", + " column_4\n", + " column_5\n", + " column_6\n", " \n", " \n", " \n", @@ -198,7 +147,7 @@ " main-captions\n", " MSRvid\n", " 2012test\n", - " 1\n", + " 0001\n", " 5.00\n", " A plane is 
taking off.\n", " An air plane is taking off.\n", @@ -208,7 +157,7 @@ " main-captions\n", " MSRvid\n", " 2012test\n", - " 4\n", + " 0004\n", " 3.80\n", " A man is playing a large flute.\n", " A man is playing a flute.\n", @@ -218,7 +167,7 @@ " main-captions\n", " MSRvid\n", " 2012test\n", - " 5\n", + " 0005\n", " 3.80\n", " A man is spreading shreded cheese on a pizza.\n", " A man is spreading shredded cheese on an uncoo...\n", @@ -228,7 +177,7 @@ " main-captions\n", " MSRvid\n", " 2012test\n", - " 6\n", + " 0006\n", " 2.60\n", " Three men are playing chess.\n", " Two men are playing chess.\n", @@ -238,178 +187,59 @@ " main-captions\n", " MSRvid\n", " 2012test\n", - " 9\n", + " 0009\n", " 4.25\n", " A man is playing the cello.\n", " A man seated is playing the cello.\n", " \n", - " \n", - " 5\n", - " main-captions\n", - " MSRvid\n", - " 2012test\n", - " 11\n", - " 4.25\n", - " Some men are fighting.\n", - " Two men are fighting.\n", - " \n", - " \n", - " 6\n", - " main-captions\n", - " MSRvid\n", - " 2012test\n", - " 12\n", - " 0.50\n", - " A man is smoking.\n", - " A man is skating.\n", - " \n", - " \n", - " 7\n", - " main-captions\n", - " MSRvid\n", - " 2012test\n", - " 13\n", - " 1.60\n", - " The man is playing the piano.\n", - " The man is playing the guitar.\n", - " \n", - " \n", - " 8\n", - " main-captions\n", - " MSRvid\n", - " 2012test\n", - " 14\n", - " 2.20\n", - " A man is playing on a guitar and singing.\n", - " A woman is playing an acoustic guitar and sing...\n", - " \n", - " \n", - " 9\n", - " main-captions\n", - " MSRvid\n", - " 2012test\n", - " 16\n", - " 5.00\n", - " A person is throwing a cat on to the ceiling.\n", - " A person throws a cat on the ceiling.\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " Column1 Column2 Column3 Column4 Column5 \\\n", - "0 main-captions MSRvid 2012test 1 5.00 \n", - "1 main-captions MSRvid 2012test 4 3.80 \n", - "2 main-captions MSRvid 2012test 5 3.80 \n", - "3 main-captions MSRvid 2012test 6 2.60 \n", - "4 
main-captions MSRvid 2012test 9 4.25 \n", - "5 main-captions MSRvid 2012test 11 4.25 \n", - "6 main-captions MSRvid 2012test 12 0.50 \n", - "7 main-captions MSRvid 2012test 13 1.60 \n", - "8 main-captions MSRvid 2012test 14 2.20 \n", - "9 main-captions MSRvid 2012test 16 5.00 \n", + " column_0 column_1 column_2 column_3 column_4 \\\n", + "0 main-captions MSRvid 2012test 0001 5.00 \n", + "1 main-captions MSRvid 2012test 0004 3.80 \n", + "2 main-captions MSRvid 2012test 0005 3.80 \n", + "3 main-captions MSRvid 2012test 0006 2.60 \n", + "4 main-captions MSRvid 2012test 0009 4.25 \n", "\n", - " Column6 \\\n", + " column_5 \\\n", "0 A plane is taking off. \n", "1 A man is playing a large flute. \n", "2 A man is spreading shreded cheese on a pizza. \n", "3 Three men are playing chess. \n", "4 A man is playing the cello. \n", - "5 Some men are fighting. \n", - "6 A man is smoking. \n", - "7 The man is playing the piano. \n", - "8 A man is playing on a guitar and singing. \n", - "9 A person is throwing a cat on to the ceiling. \n", "\n", - " Column7 \n", + " column_6 \n", "0 An air plane is taking off. \n", "1 A man is playing a flute. \n", "2 A man is spreading shredded cheese on an uncoo... \n", "3 Two men are playing chess. \n", - "4 A man seated is playing the cello. \n", - "5 Two men are fighting. \n", - "6 A man is skating. \n", - "7 The man is playing the guitar. \n", - "8 A woman is playing an acoustic guitar and sing... \n", - "9 A person throws a cat on the ceiling. " + "4 A man seated is playing the cello. 
" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dflow = dp.auto_read_file(path=os.path.join(datapath, \"sts-train.csv\"))\n", - "dflow.head()" + "df = stsbenchmark.load_pandas_df(RAW_DATA_PATH, file_split=\"train\")\n", + "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The `auto_read_file` function from the AzureML Data Prep module actually returns a `Dataflow` object, which you can read more about [here](https://docs.microsoft.com/en-us/python/api/azureml-dataprep/azureml.dataprep.dataflow?view=azure-dataprep-py). We can easily transfer the data into a Pandas DataFrame (as before) in a single line using the `to_pandas_dataframe` function, or we can continue manipulating the data as a Dataflow object using the AzureML Data Prep API. For the remainder of this notebook we will be doing the latter." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 03 Data Cleaning\n", + "### 02 Data Cleaning\n", "Now that we know about the general shape of the data, we can clean it so that it is ready for further preprocessing. The main operation we need for the STS Benchmark data is to drop all of columns except for the sentence pairs and scores." ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "sentences = dflow.keep_columns([\"Column5\", \"Column6\", \"Column7\"]).rename_columns(\n", - " {\"Column5\": \"score\", \"Column6\": \"sentence1\", \"Column7\": \"sentence2\"}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 04 One-Shot Dataframe Loading\n", - "You can also use our STSBenchmark utils to automatically download, extract, and persist the data. You can then load the sanitized data as a pandas DataFrame in one line. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "418kB [00:02, 191kB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data downloaded to ../../../data/raw/stsbenchmark\n", - "Writing clean dataframe to ../../../data/clean/stsbenchmark/sts-test.csv\n", - "Writing clean dataframe to ../../../data/clean/stsbenchmark/sts-dev.csv\n", - "Writing clean dataframe to ../../../data/clean/stsbenchmark/sts-train.csv\n" - ] - } - ], - "source": [ - "# Initializing this instance runs the downloader and extractor behind the scenes\n", - "sts_train = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -489,12 +319,13 @@ "4 A man seated is playing the cello. " ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "sts_train = stsbenchmark.clean_sts(df)\n", "sts_train.head()" ] }, @@ -502,13 +333,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 05 Make Lowercase\n", - "We start with simple standardization of the text by making all text lowercase." + "### 03 Make Lowercase\n", + "We do simple standardization of the text by making all text lowercase." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -588,7 +419,7 @@ "4 a man seated is playing the cello. " ] }, - "execution_count": 10, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -602,13 +433,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 06 Tokenize\n", + "### 04 Tokenize\n", "We tokenize the text using spaCy's non-destructive tokenizer." 
] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -680,46 +511,6 @@ " [a, man, is, playing, the, cello, .]\n", " [a, man, seated, is, playing, the, cello, .]\n", " \n", - " \n", - " 5\n", - " 4.25\n", - " some men are fighting.\n", - " two men are fighting.\n", - " [some, men, are, fighting, .]\n", - " [two, men, are, fighting, .]\n", - " \n", - " \n", - " 6\n", - " 0.50\n", - " a man is smoking.\n", - " a man is skating.\n", - " [a, man, is, smoking, .]\n", - " [a, man, is, skating, .]\n", - " \n", - " \n", - " 7\n", - " 1.60\n", - " the man is playing the piano.\n", - " the man is playing the guitar.\n", - " [the, man, is, playing, the, piano, .]\n", - " [the, man, is, playing, the, guitar, .]\n", - " \n", - " \n", - " 8\n", - " 2.20\n", - " a man is playing on a guitar and singing.\n", - " a woman is playing an acoustic guitar and sing...\n", - " [a, man, is, playing, on, a, guitar, and, sing...\n", - " [a, woman, is, playing, an, acoustic, guitar, ...\n", - " \n", - " \n", - " 9\n", - " 5.00\n", - " a person is throwing a cat on to the ceiling.\n", - " a person throws a cat on the ceiling.\n", - " [a, person, is, throwing, a, cat, on, to, the,...\n", - " [a, person, throws, a, cat, on, the, ceiling, .]\n", - " \n", " \n", "\n", "" @@ -731,11 +522,6 @@ "2 3.80 a man is spreading shreded cheese on a pizza. \n", "3 2.60 three men are playing chess. \n", "4 4.25 a man is playing the cello. \n", - "5 4.25 some men are fighting. \n", - "6 0.50 a man is smoking. \n", - "7 1.60 the man is playing the piano. \n", - "8 2.20 a man is playing on a guitar and singing. \n", - "9 5.00 a person is throwing a cat on to the ceiling. \n", "\n", " sentence2 \\\n", "0 an air plane is taking off. \n", @@ -743,11 +529,6 @@ "2 a man is spreading shredded cheese on an uncoo... \n", "3 two men are playing chess. \n", "4 a man seated is playing the cello. \n", - "5 two men are fighting. \n", - "6 a man is skating. 
\n", - "7 the man is playing the guitar. \n", - "8 a woman is playing an acoustic guitar and sing... \n", - "9 a person throws a cat on the ceiling. \n", "\n", " sentence1_tokens \\\n", "0 [a, plane, is, taking, off, .] \n", @@ -755,48 +536,36 @@ "2 [a, man, is, spreading, shreded, cheese, on, a... \n", "3 [three, men, are, playing, chess, .] \n", "4 [a, man, is, playing, the, cello, .] \n", - "5 [some, men, are, fighting, .] \n", - "6 [a, man, is, smoking, .] \n", - "7 [the, man, is, playing, the, piano, .] \n", - "8 [a, man, is, playing, on, a, guitar, and, sing... \n", - "9 [a, person, is, throwing, a, cat, on, to, the,... \n", "\n", " sentence2_tokens \n", "0 [an, air, plane, is, taking, off, .] \n", "1 [a, man, is, playing, a, flute, .] \n", "2 [a, man, is, spreading, shredded, cheese, on, ... \n", "3 [two, men, are, playing, chess, .] \n", - "4 [a, man, seated, is, playing, the, cello, .] \n", - "5 [two, men, are, fighting, .] \n", - "6 [a, man, is, skating, .] \n", - "7 [the, man, is, playing, the, guitar, .] \n", - "8 [a, woman, is, playing, an, acoustic, guitar, ... \n", - "9 [a, person, throws, a, cat, on, the, ceiling, .] " + "4 [a, man, seated, is, playing, the, cello, .] " ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sts_train_tok = to_spacy_tokens(\n", - " sts_train_low.head(10)\n", - ") # operating on a small slice of the data as an example\n", - "sts_train_tok.head(10)" + "sts_train_tok = to_spacy_tokens(sts_train_low.head())\n", + "sts_train_tok.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 07 Optional: Remove Stop Words\n", + "### 05 Optional: Remove Stop Words\n", "Removing stop words is another common preprocessing step for NLP tasks. We use the `rm_spacy_stopwords` utility function to do this on the dataframe. This function makes use of the spaCy language model's default set of stop words. 
If we need to add our own set of stop words (for example, if we are doing an NLP task for a very specific domain of content), we can do this in-line by simply providing the list as the `custom_stopwords` parameter of `rm_spacy_stopwords`." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -880,56 +649,6 @@ " [man, playing, cello, .]\n", " [man, seated, playing, cello, .]\n", " \n", - " \n", - " 5\n", - " 4.25\n", - " some men are fighting.\n", - " two men are fighting.\n", - " [some, men, are, fighting, .]\n", - " [two, men, are, fighting, .]\n", - " [men, fighting, .]\n", - " [men, fighting, .]\n", - " \n", - " \n", - " 6\n", - " 0.50\n", - " a man is smoking.\n", - " a man is skating.\n", - " [a, man, is, smoking, .]\n", - " [a, man, is, skating, .]\n", - " [man, smoking, .]\n", - " [man, skating, .]\n", - " \n", - " \n", - " 7\n", - " 1.60\n", - " the man is playing the piano.\n", - " the man is playing the guitar.\n", - " [the, man, is, playing, the, piano, .]\n", - " [the, man, is, playing, the, guitar, .]\n", - " [man, playing, piano, .]\n", - " [man, playing, guitar, .]\n", - " \n", - " \n", - " 8\n", - " 2.20\n", - " a man is playing on a guitar and singing.\n", - " a woman is playing an acoustic guitar and sing...\n", - " [a, man, is, playing, on, a, guitar, and, sing...\n", - " [a, woman, is, playing, an, acoustic, guitar, ...\n", - " [man, playing, guitar, singing, .]\n", - " [woman, playing, acoustic, guitar, singing, .]\n", - " \n", - " \n", - " 9\n", - " 5.00\n", - " a person is throwing a cat on to the ceiling.\n", - " a person throws a cat on the ceiling.\n", - " [a, person, is, throwing, a, cat, on, to, the,...\n", - " [a, person, throws, a, cat, on, the, ceiling, .]\n", - " [person, throwing, cat, ceiling, .]\n", - " [person, throws, cat, ceiling, .]\n", - " \n", " \n", "\n", "" @@ -941,11 +660,6 @@ "2 3.80 a man is spreading shreded cheese on a pizza. 
\n", "3 2.60 three men are playing chess. \n", "4 4.25 a man is playing the cello. \n", - "5 4.25 some men are fighting. \n", - "6 0.50 a man is smoking. \n", - "7 1.60 the man is playing the piano. \n", - "8 2.20 a man is playing on a guitar and singing. \n", - "9 5.00 a person is throwing a cat on to the ceiling. \n", "\n", " sentence2 \\\n", "0 an air plane is taking off. \n", @@ -953,11 +667,6 @@ "2 a man is spreading shredded cheese on an uncoo... \n", "3 two men are playing chess. \n", "4 a man seated is playing the cello. \n", - "5 two men are fighting. \n", - "6 a man is skating. \n", - "7 the man is playing the guitar. \n", - "8 a woman is playing an acoustic guitar and sing... \n", - "9 a person throws a cat on the ceiling. \n", "\n", " sentence1_tokens \\\n", "0 [a, plane, is, taking, off, .] \n", @@ -965,11 +674,6 @@ "2 [a, man, is, spreading, shreded, cheese, on, a... \n", "3 [three, men, are, playing, chess, .] \n", "4 [a, man, is, playing, the, cello, .] \n", - "5 [some, men, are, fighting, .] \n", - "6 [a, man, is, smoking, .] \n", - "7 [the, man, is, playing, the, piano, .] \n", - "8 [a, man, is, playing, on, a, guitar, and, sing... \n", - "9 [a, person, is, throwing, a, cat, on, to, the,... \n", "\n", " sentence2_tokens \\\n", "0 [an, air, plane, is, taking, off, .] \n", @@ -977,11 +681,6 @@ "2 [a, man, is, spreading, shredded, cheese, on, ... \n", "3 [two, men, are, playing, chess, .] \n", "4 [a, man, seated, is, playing, the, cello, .] \n", - "5 [two, men, are, fighting, .] \n", - "6 [a, man, is, skating, .] \n", - "7 [the, man, is, playing, the, guitar, .] \n", - "8 [a, woman, is, playing, an, acoustic, guitar, ... \n", - "9 [a, person, throws, a, cat, on, the, ceiling, .] \n", "\n", " sentence1_tokens_rm_stopwords \\\n", "0 [plane, taking, .] \n", @@ -989,34 +688,22 @@ "2 [man, spreading, shreded, cheese, pizza, .] \n", "3 [men, playing, chess, .] \n", "4 [man, playing, cello, .] \n", - "5 [men, fighting, .] \n", - "6 [man, smoking, .] 
\n", - "7 [man, playing, piano, .] \n", - "8 [man, playing, guitar, singing, .] \n", - "9 [person, throwing, cat, ceiling, .] \n", "\n", " sentence2_tokens_rm_stopwords \n", "0 [air, plane, taking, .] \n", "1 [man, playing, flute, .] \n", "2 [man, spreading, shredded, cheese, uncooked, p... \n", "3 [men, playing, chess, .] \n", - "4 [man, seated, playing, cello, .] \n", - "5 [men, fighting, .] \n", - "6 [man, skating, .] \n", - "7 [man, playing, guitar, .] \n", - "8 [woman, playing, acoustic, guitar, singing, .] \n", - "9 [person, throws, cat, ceiling, .] " + "4 [man, seated, playing, cello, .] " ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rm_spacy_stopwords(\n", - " sts_train_tok\n", - ") # operating on a small slice of the data as an example" + "rm_spacy_stopwords(sts_train_tok).head()" ] } ], @@ -1036,7 +723,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/scenarios/embeddings/embedding_trainer.ipynb b/scenarios/embeddings/embedding_trainer.ipynb index 7d656f04d..f5ed37567 100644 --- a/scenarios/embeddings/embedding_trainer.ipynb +++ b/scenarios/embeddings/embedding_trainer.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -80,17 +80,35 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████| 
401/401 [00:01<00:00, 309KB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data downloaded to ../../data\\raw\\stsbenchmark\n" + ] + } + ], "source": [ "# Produce a pandas dataframe for the training set\n", - "sts_train = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")" + "train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", + "\n", + "# Clean the sts dataset\n", + "sts_train = stsbenchmark.clean_sts(train_raw)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -170,7 +188,7 @@ "4 A man seated is playing the cello. " ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -190,7 +208,7 @@ "(5749, 3)" ] }, - "execution_count": 6, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -209,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -223,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { "scrolled": true }, @@ -237,16 +255,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "11492" + "11498" ] }, - "execution_count": 9, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -257,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -279,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -297,7 +315,7 @@ " ['man', 'seated', 'playing', 'cello', '.']]" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" 
} @@ -334,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -344,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -358,14 +376,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time elapsed: 0.3874\n" + "Time elapsed: 0.4556\n" ] } ], @@ -386,33 +404,41 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Embedding for apple: [ 0.1108162 0.24349137 -0.01440436 0.03533127 -0.06876028 0.07968962\n", - " 0.01578981 0.14264993 -0.06832716 0.00339077 0.07635406 0.06265593\n", - " 0.03414075 0.10075415 -0.05965225 0.00968812 0.16405381 -0.24480335\n", - " -0.06949984 -0.18414594 0.0465034 0.2028756 0.09074208 0.20703372\n", - " 0.1098601 -0.32350177 -0.10786435 0.08799383 -0.19245893 -0.09788057\n", - " 0.09563518 0.08567159 0.15692063 0.08486914 -0.10940372 0.10400604\n", - " 0.03643018 0.15096138 0.12341096 -0.06584675 -0.21533655 -0.01426107\n", - " -0.06800868 -0.03641699 -0.15752348 -0.01934456 0.0068708 -0.06268159\n", - " 0.04240354 -0.06285387 -0.0215644 -0.00047655 -0.0192252 -0.12477098\n", - " -0.08567388 0.08970863 0.07633136 0.21374965 0.19123942 0.01627954\n", - " 0.11209694 0.06009139 -0.03454148 0.0743629 0.03803044 0.059964\n", - " 0.08909379 -0.04600987 0.06926275 -0.09804282 0.02527839 0.16690746\n", - " -0.11900123 -0.0311705 -0.05939943 -0.14164011 0.22661647 0.08943615\n", - " -0.03721635 0.03887443 -0.15312009 0.06582782 0.13990967 0.08372186\n", - " -0.03915371 0.09002874 0.14046906 -0.04060138 0.11289847 0.0010503\n", - " -0.1014872 -0.08762068 -0.19562078 -0.03109288 -0.16293499 -0.00314896\n", - " -0.02791101 0.04398078 0.04605171 -0.08095105]\n", + 
"Embedding for apple: [ 0.05805362 0.06101197 -0.04139881 0.02149955 -0.09089632 0.08171839\n", + " 0.10880544 0.04739253 -0.18464622 0.18185261 -0.0273802 0.23335838\n", + " 0.02462817 0.19001065 0.042492 -0.03106086 0.13986434 -0.08186961\n", + " -0.04803263 -0.03560257 -0.01290459 -0.05349363 -0.01384514 -0.19388926\n", + " -0.07060098 0.06136238 -0.08374732 -0.07936234 0.14275725 -0.17378892\n", + " -0.07579862 0.1358681 0.03124874 0.07999087 -0.10487169 0.03901242\n", + " -0.03545398 0.1413099 0.06107847 -0.06615571 0.03585797 -0.1804256\n", + " 0.23718679 0.0819917 -0.17114222 0.06501587 -0.03194249 -0.05697308\n", + " -0.16496892 -0.02637602 0.01153994 -0.10465483 0.16883366 0.03583959\n", + " -0.05584354 0.11883577 -0.01215279 -0.2250833 -0.07159518 0.08646166\n", + " 0.00850767 0.07679912 -0.13213757 -0.08736049 -0.09475534 -0.03855689\n", + " 0.01396248 -0.02864163 0.00354996 -0.01462657 -0.08833787 -0.11314301\n", + " -0.04131266 -0.09071928 -0.03713143 0.1178434 -0.12651944 -0.11256607\n", + " 0.13031591 -0.15850762 0.11350677 0.14365956 -0.02895318 0.09518009\n", + " -0.02517641 0.00678065 -0.01811527 -0.08079742 0.10072935 0.2130049\n", + " -0.10550384 -0.01195244 -0.0962322 0.05746774 0.05794769 0.22316577\n", + " -0.00290377 -0.11464126 0.01171946 -0.04879373]\n", "\n", "First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" + ] } ], "source": [ @@ -456,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -466,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -480,14 +506,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time elapsed: 10.4061\n" + "Time elapsed: 10.3698\n" ] } ], @@ -504,30 +530,30 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Embedding for apple: [ 0.24594913 0.0478383 0.576843 -0.14472146 -0.13372016 0.3994271\n", - " -0.18761183 -0.10253572 -0.5489808 0.3115678 0.18665203 0.08805989\n", - " 0.565551 0.26285723 0.01494028 0.39692047 -0.39978772 -0.30473194\n", - " 0.05508447 0.10066988 0.20679028 0.30775183 0.0472638 -0.239493\n", - " 0.12949444 -0.20410636 -0.13940431 -0.03945793 0.4396631 -0.08924853\n", - " 0.08834386 -0.22228362 0.28431413 0.18899629 0.3427995 -0.2114068\n", - " -0.01075403 0.8549923 0.09068774 -0.04244559 -0.22046468 0.06916029\n", - " -0.31791446 0.11447909 -0.05693823 0.10290135 -0.09406947 -0.26463747\n", - " -0.17336299 0.07076416 -0.26909345 0.1761348 0.14077482 0.24621071\n", - " -0.0408617 -0.3031526 0.10244257 0.4772046 0.25927255 -0.02917116\n", - " 0.2211562 0.04355185 0.19956268 0.13878216 0.28868207 -0.5039835\n", - " 0.41010958 0.07107946 -0.09606131 -0.22969621 0.05883528 -0.01241339\n", - " 0.00676485 0.311163 0.08247512 -0.13799056 0.15181121 0.08045118\n", - " -0.06654785 0.04279696 0.532607 0.2505259 0.10194286 
0.05519621\n", - " -0.451315 -0.24121635 0.10120259 0.36105216 0.47429752 0.4230102\n", - " -0.07235575 -0.16397384 0.28193682 -0.21931437 -0.16088559 -0.03915804\n", - " 0.41476008 -0.03525754 0.34007013 -0.152273 ]\n", + "Embedding for apple: [-0.18175453 -0.14863092 0.01440668 0.41852772 0.4886491 -0.24110396\n", + " -0.26591563 -0.42659786 -0.04840926 -0.05654079 0.26051033 -0.02733019\n", + " 0.00937179 -0.07287153 0.21057971 0.21508346 0.06344912 0.10872953\n", + " -0.10214202 -0.54538804 -0.15845574 -0.05536952 -0.04718296 -0.46515992\n", + " -0.12252445 -0.09347973 0.11549287 0.14775406 -0.4141621 0.24835227\n", + " 0.08907127 -0.00180367 -0.02042806 0.13677692 0.19265138 0.1525672\n", + " 0.05339279 -0.18745865 -0.38480887 -0.26928213 0.2699537 0.38778877\n", + " 0.28482276 -0.17511593 0.11898511 -0.06478633 -0.39813048 0.30248052\n", + " 0.03833921 0.08309021 -0.06976178 -0.15951832 -0.6560336 -0.4534666\n", + " -0.18082033 0.09569218 0.10938869 -0.3292928 -0.4216524 0.24858503\n", + " -0.35272446 -0.30754313 0.06224228 0.23139575 -0.11154156 0.03544799\n", + " -0.09699723 0.13625555 0.3257419 -0.09298395 0.3291442 -0.03776973\n", + " -0.17104091 -0.19018205 0.13310616 0.22434781 -0.00192542 -0.22643566\n", + " -0.02940017 -0.3396929 0.09581995 -0.09487487 0.15184835 0.05633284\n", + " -0.13727354 0.28902617 -0.09076066 -0.15375414 0.11667106 0.1914239\n", + " 0.36700025 0.03567546 0.67464125 0.48771846 -0.40189445 -0.37667385\n", + " -0.50891036 -0.16170104 -0.40450782 0.07738833]\n", "\n", "First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n" ] @@ -589,7 +615,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -601,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, 
"outputs": [], "source": [ @@ -626,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -634,7 +660,7 @@ "output_type": "stream", "text": [ "BUILDING VOCABULARY\n", - "Processed 0 tokens.Processed 84997 tokens.\n", + "Processed 0 tokens.Processed 85334 tokens.\n", "Counted 11716 unique words.\n", "Truncating vocabulary at min count 5.\n", "Using vocabulary of size 2943.\n", @@ -665,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -679,9 +705,9 @@ "overflow length: 38028356\n", "Reading vocab from file \"../../data/trained_word_embeddings/vocab.txt\"...loaded 2943 words.\n", "Building lookup table...table contains 8661250 elements.\n", - "Processing token: 0Processed 84997 tokens.\n", + "Processing token: 0Processed 85334 tokens.\n", "Writing cooccurrences to disk......2 files in total.\n", - "Merging cooccurrence files: processed 0 lines.0 lines.100000 lines.Merging cooccurrence files: processed 187717 lines.\n", + "Merging cooccurrence files: processed 0 lines.0 lines.100000 lines.Merging cooccurrence files: processed 188154 lines.\n", "\n" ] } @@ -706,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -715,9 +741,9 @@ "text": [ "SHUFFLING COOCCURRENCES\n", "array size: 255013683\n", - "Shuffling by chunks: processed 0 lines.processed 187717 lines.\n", + "Shuffling by chunks: processed 0 lines.processed 188154 lines.\n", "Wrote 1 temporary file(s).\n", - "Merging temp files: processed 0 lines.187717 lines.Merging temp files: processed 187717 lines.\n", + "Merging temp files: processed 0 lines.188154 lines.Merging temp files: processed 188154 lines.\n", "\n" ] } @@ -747,7 +773,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -755,27 +781,27 @@ "output_type": "stream", "text": [ "TRAINING 
MODEL\n", - "Read 187717 lines.\n", + "Read 188154 lines.\n", "Initializing parameters...done.\n", "vector size: 50\n", "vocab size: 2943\n", "x_max: 10.000000\n", "alpha: 0.750000\n", - "05/09/19 - 03:10.13PM, iter: 001, cost: 0.078329\n", - "05/09/19 - 03:10.13PM, iter: 002, cost: 0.072090\n", - "05/09/19 - 03:10.13PM, iter: 003, cost: 0.070081\n", - "05/09/19 - 03:10.13PM, iter: 004, cost: 0.067171\n", - "05/09/19 - 03:10.13PM, iter: 005, cost: 0.063501\n", - "05/09/19 - 03:10.13PM, iter: 006, cost: 0.060700\n", - "05/09/19 - 03:10.13PM, iter: 007, cost: 0.058092\n", - "05/09/19 - 03:10.13PM, iter: 008, cost: 0.056080\n", - "05/09/19 - 03:10.13PM, iter: 009, cost: 0.054016\n", - "05/09/19 - 03:10.13PM, iter: 010, cost: 0.051806\n", - "05/09/19 - 03:10.13PM, iter: 011, cost: 0.049565\n", - "05/09/19 - 03:10.13PM, iter: 012, cost: 0.047378\n", - "05/09/19 - 03:10.13PM, iter: 013, cost: 0.045232\n", - "05/09/19 - 03:10.13PM, iter: 014, cost: 0.043136\n", - "05/09/19 - 03:10.13PM, iter: 015, cost: 0.041132\n" + "06/26/19 - 09:24.10AM, iter: 001, cost: 0.078565\n", + "06/26/19 - 09:24.10AM, iter: 002, cost: 0.072320\n", + "06/26/19 - 09:24.10AM, iter: 003, cost: 0.070274\n", + "06/26/19 - 09:24.10AM, iter: 004, cost: 0.067244\n", + "06/26/19 - 09:24.10AM, iter: 005, cost: 0.063690\n", + "06/26/19 - 09:24.10AM, iter: 006, cost: 0.060640\n", + "06/26/19 - 09:24.10AM, iter: 007, cost: 0.058201\n", + "06/26/19 - 09:24.10AM, iter: 008, cost: 0.056211\n", + "06/26/19 - 09:24.10AM, iter: 009, cost: 0.054148\n", + "06/26/19 - 09:24.10AM, iter: 010, cost: 0.051913\n", + "06/26/19 - 09:24.10AM, iter: 011, cost: 0.049649\n", + "06/26/19 - 09:24.10AM, iter: 012, cost: 0.047426\n", + "06/26/19 - 09:24.10AM, iter: 013, cost: 0.045255\n", + "06/26/19 - 09:24.10AM, iter: 014, cost: 0.043138\n", + "06/26/19 - 09:24.10AM, iter: 015, cost: 0.041108\n" ] } ], @@ -787,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, "metadata": {}, "outputs": [], 
"source": [ @@ -796,14 +822,14 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time elapsed: 8.1586\n" + "Time elapsed: 25.0459\n" ] } ], @@ -827,7 +853,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -841,16 +867,16 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Embedding for apple: [0.123773, -0.053006, 0.070493, 0.108794, 0.056317, -0.121031, 0.031882, 0.036723, -0.080099, 0.070415, -0.049969, 0.13519, 0.02835, 0.077195, 0.038348, -0.07014, 0.064163, -0.073477, 0.054575, 0.000798, 0.144856, 0.129294, 0.088421, 0.098318, -0.208831, 0.003972, 0.043487, 0.098745, -0.135213, -0.080192, 0.033854, -0.092947, -0.086098, 0.063487, -0.003857, -0.040265, 0.006533, -0.028026, -0.0315, -0.046298, 0.053757, -0.038117, 0.008664, -0.141584, 0.105524, 0.106604, -0.102875, 0.062868, -0.185542, -0.002386]\n", + "Embedding for apple: [0.062942, -0.097984, 0.037373, 0.111635, 0.086733, -0.071781, 0.043611, -0.01458, -0.012725, 0.076614, -0.13072, 0.129127, -0.00262, 0.015669, 0.06114, -0.044421, 0.004353, -0.066637, 0.049023, -0.00885, 0.138072, 0.165017, 0.047256, 0.122998, -0.247253, 0.01951, 0.007255, 0.070611, -0.130033, -0.05971, 0.056946, -0.085183, -0.118371, 0.033433, -0.035763, 0.021646, -0.005461, -0.03758, -0.048107, -0.075025, 0.012993, -0.07799, -0.030288, -0.137319, 0.121737, 0.054742, -0.013201, 0.055261, -0.146741, -0.041641]\n", "\n", - "First 30 vocabulary words: ['.', ',', 'man', '-', 'woman', \"'\", 'said', 'dog', '\"', 'playing', ':', 'white', 'black', '$', 'killed', 'percent', 'new', 'syria', 'people', 'china']\n" + "First 30 vocabulary words: ['.', ',', 'man', '-', '\"', 'woman', \"'\", 'said', 'dog', 'playing', ':', 'white', 'black', '$', 'killed', 
'percent', 'new', 'syria', 'people', 'china']\n" ] } ], diff --git a/scenarios/entailment/entailment_xnli_multilingual.ipynb b/scenarios/entailment/entailment_xnli_multilingual.ipynb new file mode 100644 index 000000000..0816e8a47 --- /dev/null +++ b/scenarios/entailment/entailment_xnli_multilingual.ipynb @@ -0,0 +1,581 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multi-lingual Inference on XNLI Dataset using BERT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "In this notebook, we demostrate using the [Multi-lingual BERT model](https://github.com/google-research/bert/blob/master/multilingual.md) to do language inference in Chinese and Hindi. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral. \n", + "The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. 
A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import random\n", + "import numpy as np\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "import torch\n", + "\n", + "nlp_path = os.path.abspath('../../')\n", + "if nlp_path not in sys.path:\n", + " sys.path.insert(0, nlp_path)\n", + "\n", + "from utils_nlp.bert.sequence_classification import BERTSequenceClassifier\n", + "from utils_nlp.bert.common import Language, Tokenizer\n", + "from utils_nlp.dataset.xnli import load_pandas_df\n", + "from utils_nlp.common.timer import Timer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configurations\n", + "Note that the running time shown in this notebook are on a Standard_NC12 Azure Deep Learning Virtual Machine with two NVIDIA Tesla K80 GPUs. If you want to run through the notebook quickly, you can change the `TRAIN_DATA_USED_PERCENT` to a small number, e.g. 0.01. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "TRAIN_DATA_USED_PERCENT = 1.0\n", + "\n", + "# set random seeds\n", + "RANDOM_SEED = 42\n", + "random.seed(RANDOM_SEED)\n", + "np.random.seed(RANDOM_SEED)\n", + "torch.manual_seed(RANDOM_SEED)\n", + "num_cuda_devices = torch.cuda.device_count()\n", + "if num_cuda_devices > 1:\n", + " torch.cuda.manual_seed_all(RANDOM_SEED)\n", + "\n", + "# model configurations\n", + "LANGUAGE_CHINESE = Language.CHINESE\n", + "LANGUAGE_MULTI = Language.MULTILINGUAL\n", + "TO_LOWER = True\n", + "MAX_SEQ_LENGTH = 128\n", + "\n", + "# training configurations\n", + "NUM_GPUS = 2\n", + "BATCH_SIZE = 32\n", + "NUM_EPOCHS = 2\n", + "\n", + "# optimizer configurations\n", + "LEARNING_RATE= 5e-5\n", + "WARMUP_PROPORTION= 0.1\n", + "\n", + "# data configurations\n", + "TEXT_COL = \"text\"\n", + "LABEL_COL = \"label\"\n", + "\n", + "CACHE_DIR = \"./temp\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "The XNLI dataset comes in two zip files: \n", + "* XNLI-1.0.zip: dev and test datasets in 15 languages. The original English data was translated into other languages by human translators. \n", + "* XNLI-MT-1.0.zip: training dataset in 15 languages. This dataset is machine translations of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset. It also contains English translations of the dev and test datasets, but not used in this notebook. \n", + "\n", + "The `load_pandas_df` function downloads and extracts the zip files if they don't already exist in `local_cache_path` and returns the data subset specified by `file_split` and `language`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "train_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"zh\")\n", + "dev_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"zh\")\n", + "test_df_chinese = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"zh\")\n", + "\n", + "train_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\", language=\"hi\")\n", + "dev_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"dev\", language=\"hi\")\n", + "test_df_hindi = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\", language=\"hi\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chinese training dataset size: 392702\n", + "Chinese dev dataset size: 2490\n", + "Chinese test dataset size: 5010\n", + "\n", + "Hindi training dataset size: 392702\n", + "Hindi dev dataset size: 2490\n", + "Hindi test dataset size: 5010\n", + "\n", + " text label\n", + "0 (从 概念 上 看 , 奶油 收入 有 两 个 基本 方面 产品 和 地理 ., 产品 和 ... neutral\n", + "1 (你 知道 在 这个 季节 , 我 猜 在 你 的 水平 你 把 他们 丢到 下 一个 水平... entailment\n", + "2 (我们 的 一个 号码 会 非常 详细 地 执行 你 的 指示, 我 团队 的 一个 成员 ... entailment\n", + "3 (你 怎么 知道 的 ? 所有 这些 都 是 他们 的 信息 ., 这些 信息 属于 他们 .) entailment\n", + "4 (是 啊 , 我 告诉 你 , 如果 你 去 买 一些 网球鞋 , 我 可以 看到 为什么 ... neutral\n", + " text label\n", + "0 (Conceptually क ् रीम एंजलिस में दो मूल आयाम ह... neutral\n", + "1 (आप मौसम के दौरान जानते हैं और मैं अपने स ् तर... entailment\n", + "2 (हमारे एक नंबर में से एक आपके निर ् देशों को म... entailment\n", + "3 (आप कैसे जानते हैं ? ये सब उनकी जानकारी फिर से... entailment\n", + "4 (हाँ मैं आपको बताता हूँ कि अगर आप उन टेनिस जूत... 
neutral\n" + ] + } + ], + "source": [ + "print(\"Chinese training dataset size: {}\".format(train_df_chinese.shape[0]))\n", + "print(\"Chinese dev dataset size: {}\".format(dev_df_chinese.shape[0]))\n", + "print(\"Chinese test dataset size: {}\".format(test_df_chinese.shape[0]))\n", + "print()\n", + "print(\"Hindi training dataset size: {}\".format(train_df_hindi.shape[0]))\n", + "print(\"Hindi dev dataset size: {}\".format(dev_df_hindi.shape[0]))\n", + "print(\"Hindi test dataset size: {}\".format(test_df_hindi.shape[0]))\n", + "print()\n", + "print(train_df_chinese.head())\n", + "print(train_df_hindi.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_used_count = round(TRAIN_DATA_USED_PERCENT * train_df_chinese.shape[0])\n", + "train_df_chinese = train_df_chinese.loc[:train_data_used_count]\n", + "train_df_hindi = train_df_hindi.loc[:train_data_used_count]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Language Inference on Chinese\n", + "For Chinese dataset, we use the `bert-base-chinese` model which was pretrained on Chinese dataset only. The `bert-base-multilingual-cased` model can also be used on Chinese, but the accuracy is 3% lower." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenize and Preprocess\n", + "Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 392702/392702 [02:26<00:00, 2682.67it/s]\n", + "100%|██████████| 5010/5010 [00:01<00:00, 3122.04it/s]\n" + ] + } + ], + "source": [ + "tokenizer_chinese = Tokenizer(LANGUAGE_CHINESE, to_lower=TO_LOWER, cache_dir=CACHE_DIR)\n", + "\n", + "train_tokens_chinese = tokenizer_chinese.tokenize(train_df_chinese[TEXT_COL])\n", + "test_tokens_chinese= tokenizer_chinese.tokenize(test_df_chinese[TEXT_COL])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition, we perform the following preprocessing steps in the cell below:\n", + "\n", + "* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n", + "* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n", + "* Pad or truncate the token lists to the specified max length\n", + "* Return mask lists that indicate paddings' positions\n", + "* Return token type id lists that indicate which sentence the tokens belong to\n", + "\n", + "*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_token_ids_chinese, train_input_mask_chinese, train_token_type_ids_chinese = \\\n", + " tokenizer_chinese.preprocess_classification_tokens(train_tokens_chinese, max_len=MAX_SEQ_LENGTH)\n", + "test_token_ids_chinese, test_input_mask_chinese, test_token_type_ids_chinese = \\\n", + " tokenizer_chinese.preprocess_classification_tokens(test_tokens_chinese, max_len=MAX_SEQ_LENGTH)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "label_encoder_chinese = LabelEncoder()\n", + "train_labels_chinese = 
label_encoder_chinese.fit_transform(train_df_chinese[LABEL_COL])\n", + "num_labels_chinese = len(np.unique(train_labels_chinese))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "classifier_chinese = BERTSequenceClassifier(language=LANGUAGE_CHINESE,\n", + " num_labels=num_labels_chinese,\n", + " cache_dir=CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch:1/2; batch:1->1228/12271; loss:1.194384\n", + "epoch:1/2; batch:1229->2456/12271; loss:0.863067\n", + "epoch:1/2; batch:2457->3684/12271; loss:0.781256\n", + "epoch:1/2; batch:3685->4912/12271; loss:1.067413\n", + "epoch:1/2; batch:4913->6140/12271; loss:0.599279\n", + "epoch:1/2; batch:6141->7368/12271; loss:0.471488\n", + "epoch:1/2; batch:7369->8596/12271; loss:0.572327\n", + "epoch:1/2; batch:8597->9824/12271; loss:0.689093\n", + "epoch:1/2; batch:9825->11052/12271; loss:0.651702\n", + "epoch:1/2; batch:11053->12271/12271; loss:0.431085\n", + "epoch:2/2; batch:1->1228/12271; loss:0.255859\n", + "epoch:2/2; batch:1229->2456/12271; loss:0.434052\n", + "epoch:2/2; batch:2457->3684/12271; loss:0.433569\n", + "epoch:2/2; batch:3685->4912/12271; loss:0.405915\n", + "epoch:2/2; batch:4913->6140/12271; loss:0.636128\n", + "epoch:2/2; batch:6141->7368/12271; loss:0.416685\n", + "epoch:2/2; batch:7369->8596/12271; loss:0.265789\n", + "epoch:2/2; batch:8597->9824/12271; loss:0.328964\n", + "epoch:2/2; batch:9825->11052/12271; loss:0.436310\n", + "epoch:2/2; batch:11053->12271/12271; loss:0.374193\n", + "Training time : 8.050 hrs\n" + ] + } + ], + "source": [ + "with Timer() as t:\n", + " 
classifier_chinese.fit(token_ids=train_token_ids_chinese,\n", + " input_mask=train_input_mask_chinese,\n", + " token_type_ids=train_token_type_ids_chinese,\n", + " labels=train_labels_chinese,\n", + " num_gpus=NUM_GPUS,\n", + " num_epochs=NUM_EPOCHS,\n", + " batch_size=BATCH_SIZE,\n", + " lr=LEARNING_RATE,\n", + " warmup_proportion=WARMUP_PROPORTION)\n", + "print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Predict on Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5024it [00:54, 101.88it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction time : 0.015 hrs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer() as t:\n", + " predictions_chinese = classifier_chinese.predict(token_ids=test_token_ids_chinese,\n", + " input_mask=test_input_mask_chinese,\n", + " token_type_ids=test_token_type_ids_chinese,\n", + " batch_size=BATCH_SIZE)\n", + "print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + "contradiction 0.81 0.84 0.82 1670\n", + " entailment 0.84 0.68 0.76 1670\n", + " neutral 0.70 0.80 0.74 1670\n", + "\n", + " accuracy 0.77 5010\n", + " macro avg 0.78 0.77 0.77 5010\n", + " weighted avg 0.78 0.77 0.77 5010\n", + "\n" + ] + } + ], + "source": [ + "predictions_chinese = label_encoder_chinese.inverse_transform(predictions_chinese)\n", + "print(classification_report(test_df_chinese[LABEL_COL], predictions_chinese))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Language Inference on Hindi\n", + "For Hindi and all other languages except Chinese, we use the `bert-base-multilingual-cased` model. \n", + "The preprocesing, model training, and prediction steps are the same as on Chinese data, except for the underlying tokenizer and BERT model used" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenize and Preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 392702/392702 [03:48<00:00, 1719.84it/s]\n", + "100%|██████████| 5010/5010 [00:02<00:00, 1916.46it/s]\n" + ] + } + ], + "source": [ + "tokenizer_multi = Tokenizer(LANGUAGE_MULTI, cache_dir=CACHE_DIR)\n", + "\n", + "train_tokens_hindi = tokenizer_multi.tokenize(train_df_hindi[TEXT_COL])\n", + "test_tokens_hindi= tokenizer_multi.tokenize(test_df_hindi[TEXT_COL])\n", + "\n", + "train_token_ids_hindi, train_input_mask_hindi, train_token_type_ids_hindi = \\\n", + " tokenizer_multi.preprocess_classification_tokens(train_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n", + "test_token_ids_hindi, test_input_mask_hindi, test_token_type_ids_hindi = \\\n", + " tokenizer_multi.preprocess_classification_tokens(test_tokens_hindi, max_len=MAX_SEQ_LENGTH)\n", + "\n", + "label_encoder_hindi = LabelEncoder()\n", + "train_labels_hindi = label_encoder_hindi.fit_transform(train_df_hindi[LABEL_COL])\n", + "num_labels_hindi = len(np.unique(train_labels_hindi))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create and Train Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch:1/2; batch:1->1228/12271; loss:1.091754\n", + "epoch:1/2; batch:1229->2456/12271; loss:0.992931\n", + "epoch:1/2; batch:2457->3684/12271; 
loss:1.045146\n", + "epoch:1/2; batch:3685->4912/12271; loss:0.799912\n", + "epoch:1/2; batch:4913->6140/12271; loss:0.815425\n", + "epoch:1/2; batch:6141->7368/12271; loss:0.564856\n", + "epoch:1/2; batch:7369->8596/12271; loss:0.726981\n", + "epoch:1/2; batch:8597->9824/12271; loss:0.764087\n", + "epoch:1/2; batch:9825->11052/12271; loss:0.964115\n", + "epoch:1/2; batch:11053->12271/12271; loss:0.502252\n", + "epoch:2/2; batch:1->1228/12271; loss:0.601600\n", + "epoch:2/2; batch:1229->2456/12271; loss:0.695099\n", + "epoch:2/2; batch:2457->3684/12271; loss:0.419610\n", + "epoch:2/2; batch:3685->4912/12271; loss:0.603106\n", + "epoch:2/2; batch:4913->6140/12271; loss:0.705180\n", + "epoch:2/2; batch:6141->7368/12271; loss:0.493404\n", + "epoch:2/2; batch:7369->8596/12271; loss:0.864921\n", + "epoch:2/2; batch:8597->9824/12271; loss:0.518601\n", + "epoch:2/2; batch:9825->11052/12271; loss:0.395920\n", + "epoch:2/2; batch:11053->12271/12271; loss:0.685858\n", + "Training time : 9.520 hrs\n" + ] + } + ], + "source": [ + "classifier_multi = BERTSequenceClassifier(language=LANGUAGE_MULTI,\n", + " num_labels=num_labels_hindi,\n", + " cache_dir=CACHE_DIR)\n", + "with Timer() as t:\n", + " classifier_multi.fit(token_ids=train_token_ids_hindi,\n", + " input_mask=train_input_mask_hindi,\n", + " token_type_ids=train_token_type_ids_hindi,\n", + " labels=train_labels_hindi,\n", + " num_gpus=NUM_GPUS,\n", + " num_epochs=NUM_EPOCHS,\n", + " batch_size=BATCH_SIZE,\n", + " lr=LEARNING_RATE,\n", + " warmup_proportion=WARMUP_PROPORTION)\n", + "print(\"Training time : {:.3f} hrs\".format(t.interval / 3600))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Predict and Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5024it [01:02, 87.10it/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction time : 
0.017 hrs\n", + " precision recall f1-score support\n", + "\n", + "contradiction 0.69 0.72 0.70 1670\n", + " entailment 0.74 0.51 0.60 1670\n", + " neutral 0.58 0.74 0.65 1670\n", + "\n", + " accuracy 0.65 5010\n", + " macro avg 0.67 0.65 0.65 5010\n", + " weighted avg 0.67 0.65 0.65 5010\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "with Timer() as t:\n", + " predictions_hindi = classifier_multi.predict(token_ids=test_token_ids_hindi,\n", + " input_mask=test_input_mask_hindi,\n", + " token_type_ids=test_token_type_ids_hindi,\n", + " batch_size=BATCH_SIZE)\n", + "print(\"Prediction time : {:.3f} hrs\".format(t.interval / 3600))\n", + "predictions_hindi= label_encoder_hindi.inverse_transform(predictions_hindi)\n", + "print(classification_report(test_df_hindi[LABEL_COL], predictions_hindi))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "pytorch" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scenarios/named_entity_recognition/README.md b/scenarios/named_entity_recognition/README.md index e69de29bb..2c80ba53e 100644 --- a/scenarios/named_entity_recognition/README.md +++ b/scenarios/named_entity_recognition/README.md @@ -0,0 +1,8 @@ +# Named Entity Recognition (NER) +Named Entity Recognition (NER) is the task of detecting and classifying +real-world objects mentioned in text. Common named entities include person +names, locations, organizations, etc. The state-of-the art NER methods include +combining Long Short-Term Memory neural network with Conditional Random Field +(LSTM-CRF) and pretrained language models like BERT. 
NER can be used for +information extraction and filtering. It also plays an important role in other +NLP tasks like question answering and text summarization. diff --git a/scenarios/named_entity_recognition/ner_msra_bert_chinese.ipynb b/scenarios/named_entity_recognition/ner_msra_bert_chinese.ipynb new file mode 100644 index 000000000..611aa38cb --- /dev/null +++ b/scenarios/named_entity_recognition/ner_msra_bert_chinese.ipynb @@ -0,0 +1,849 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Copyright (c) Microsoft Corporation. All rights reserved.* \n", + "*Licensed under the MIT License.*\n", + "# Named Entity Recognition Using BERT on Chinese\n", + "## Summary\n", + "This notebook demonstrates how to fine tune [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) for named entity recognition (NER) task on Chinese text. Utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, model scoring and model evaluation.\n", + "\n", + "[BERT (Bidirectional Transformers for Language Understanding)](https://arxiv.org/pdf/1810.04805.pdf) is a powerful pre-trained lanaguage model that can be used for multiple NLP tasks, including text classification, question answering, named entity recognition, etc. It's able to achieve state of the art performance with only a few epochs of fine tuning on task specific datasets. \n", + "The figure below illustrates how BERT can be fine tuned for NER tasks. The input data is a list of tokens representing a sentence. In the training data, each token has an entity label. After fine tuning, the model predicts an entity label for each token in a given testing sentence. \n", + "\n", + "\n", + "\n", + "Named Entity Recognition on non-English text is not very differnt from that on English text. The only difference is the model used, which is configured by the `LANGUAGE` variable below. 
For non-English languages including Chinese, the *bert-base-multilingual-cased* model can be used by setting `LANGUAGE = Language.MULTILINGUAL`. For Chinese, the *bert-base-chinese* model can also be used by setting `LANGUAGE = Language.CHINESE`. On Chinese text, the performance of *bert-base-chinese* is usually better than *bert-base-multilingual-cased* because the *bert-base-chinese* model is pretrained on Chinese data only. On this particular dataset, the performances of the Chinese-only model and multilingual model are very similar" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Required packages\n", + "* pytorch\n", + "* pytorch-pretrained-bert\n", + "* pandas\n", + "* seqeval" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import random\n", + "from seqeval.metrics import classification_report\n", + "\n", + "import torch\n", + "\n", + "nlp_path = os.path.abspath('../../')\n", + "if nlp_path not in sys.path:\n", + " sys.path.insert(0, nlp_path)\n", + "\n", + "from utils_nlp.bert.token_classification import BERTTokenClassifier, postprocess_token_labels, create_label_map\n", + "from utils_nlp.bert.common import Language, Tokenizer\n", + "from utils_nlp.dataset.msra_ner import load_pandas_df, get_unique_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configurations" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# path configurations\n", + "CACHE_DIR = \"./temp\"\n", + "\n", + "# set random seeds\n", + "RANDOM_SEED = 100\n", + "torch.manual_seed(RANDOM_SEED)\n", + "\n", + "# model configurations\n", + "LANGUAGE = Language.CHINESE\n", + "DO_LOWER_CASE = True\n", + "MAX_SEQ_LENGTH = 200\n", + "\n", + "# training configurations\n", + "BATCH_SIZE = 16\n", + "NUM_TRAIN_EPOCHS = 1\n", + "\n", + 
"# optimizer configuration\n", + "LEARNING_RATE = 3e-5\n", + "\n", + "TEXT_COL = \"sentence\"\n", + "LABEL_COL = \"labels\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get training and testing data\n", + "The dataset used in this notebook is the MSRA NER dataset. The dataset consists of 45000 training sentences and 3442 testing sentences. \n", + "\n", + "The helper function `load_pandas_df` downloads the data files if they don't exist in `local_cache_path`. It returns the training or testing data frame based on `file_split`\n", + "\n", + "The helper function `get_unique_labels` returns the unique entity labels in the dataset. There are 7 unique labels in the dataset: \n", + "* 'O': non-entity \n", + "* 'B-LOC': beginning of location entity\n", + "* 'I-LOC': within location entity\n", + "* 'B-PER': beginning of person entity\n", + "* 'I-PER': within person entity\n", + "* 'B-ORG': beginning of organization entity\n", + "* 'I-ORG': within organization entity\n", + "\n", + "The maximum number of words in a sentence is 2427. We set MAX_SEQ_LENGTH to 200 above to reduce the GPU memory needed to run this notebook. Less than 1% of testing data are longer than 200, so this should have negligible impact on the model performance evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum sequence length in train data is: 746\n", + "Maximum sequence length in test data is: 2427\n", + "Number of sentences in training data: 45000\n", + "Number of sentences in testing data: 3442\n", + "Unique labels: ['O', 'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER']\n" + ] + } + ], + "source": [ + "train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"train\")\n", + "test_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split=\"test\")\n", + "label_list = get_unique_labels()\n", + "print(\"Number of sentences in training data: {}\".format(train_df.shape[0]))\n", + "print(\"Number of sentences in testing data: {}\".format(test_df.shape[0]))\n", + "print(\"Unique labels: {}\".format(label_list))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencelabels
0当 希 望 工 程 救 助 的 百 万 儿 童 成 长 起 来 , 科 教 兴 国 蔚 然 ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
1藏 书 本 来 就 是 所 有 传 统 收 藏 门 类 中 的 第 一 大 户 , 只 是 ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
2因 有 关 日 寇 在 京 掠 夺 文 物 详 情 , 藏 界 较 为 重 视 , 也 是 ...[O, O, O, B-LOC, O, O, B-LOC, O, O, O, O, O, O...
3我 们 藏 有 一 册 1 9 4 5 年 6 月 油 印 的 《 北 京 文 物 保 存 ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4以 家 乡 的 历 史 文 献 、 特 定 历 史 时 期 书 刊 、 某 一 名 家 或 ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "
" + ], + "text/plain": [ + " sentence \\\n", + "0 当 希 望 工 程 救 助 的 百 万 儿 童 成 长 起 来 , 科 教 兴 国 蔚 然 ... \n", + "1 藏 书 本 来 就 是 所 有 传 统 收 藏 门 类 中 的 第 一 大 户 , 只 是 ... \n", + "2 因 有 关 日 寇 在 京 掠 夺 文 物 详 情 , 藏 界 较 为 重 视 , 也 是 ... \n", + "3 我 们 藏 有 一 册 1 9 4 5 年 6 月 油 印 的 《 北 京 文 物 保 存 ... \n", + "4 以 家 乡 的 历 史 文 献 、 特 定 历 史 时 期 书 刊 、 某 一 名 家 或 ... \n", + "\n", + " labels \n", + "0 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "1 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "2 [O, O, O, B-LOC, O, O, B-LOC, O, O, O, O, O, O... \n", + "3 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "4 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenization and Preprocessing\n", + "The `tokenize_ner` method of the `Tokenizer` class converts raw string data to numerical features, involving the following steps:\n", + "1. WordPiece tokenization.\n", + "2. Convert tokens and labels to numerical values, i.e. token ids and label ids.\n", + "3. Sequence padding or truncation according to the `max_seq_length` configuration." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create a dictionary that maps labels to numerical values**" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "label_map = create_label_map(label_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Tokenize input text**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "tokenizer = Tokenizer(language=LANGUAGE, \n", + " to_lower=DO_LOWER_CASE, \n", + " cache_dir=CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create numerical features** " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "train_token_ids, train_input_mask, train_trailing_token_mask, train_label_ids = \\\n", + " tokenizer.tokenize_ner(text=train_df[TEXT_COL],\n", + " label_map=label_map,\n", + " max_len=MAX_SEQ_LENGTH,\n", + " labels=train_df[LABEL_COL])\n", + "test_token_ids, test_input_mask, test_trailing_token_mask, test_label_ids = \\\n", + " tokenizer.tokenize_ner(text=test_df[TEXT_COL],\n", + " label_map=label_map,\n", + " max_len=MAX_SEQ_LENGTH,\n", + " labels=test_df[LABEL_COL])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Tokenizer.tokenize_ner` outputs three or four lists of numerical features lists, each sublist contains features of an input sentence: \n", + "1. token ids: list of numerical values each corresponds to a token.\n", + "2. attention mask: list of 1s and 0s, 1 for input tokens and 0 for padded tokens, so that padded tokens are not attended to. \n", + "3. trailing word piece mask: boolean list, `True` for the first word piece of each original word, `False` for the trailing word pieces, e.g. ##ize. 
This mask is useful for removing predictions on trailing word pieces, so that each original word in the input text has a unique predicted label. \n", + "4. label ids: list of numerical values each corresponds to an entity label, if `labels` is provided." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample token ids:\n", + "[2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636, 674, 1036, 4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768, 7599, 3198, 8024, 791, 1921, 3300, 3119, 5966, 817, 966, 4638, 741, 872, 3766, 743, 8024, 3209, 3189, 2218, 1373, 872, 2637, 679, 2496, 1159, 8013, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n", + "\n", + "Sample attention mask:\n", + "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n", + "\n", + "Sample trailing token mask:\n", + "[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]\n", + "\n", + "Sample label ids:\n", + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "\n" + ] + } + ], + "source": [ + "print((\"Sample token ids:\\n{}\\n\".format(train_token_ids[0])))\n", + "print(\"Sample attention mask:\\n{}\\n\".format(train_input_mask[0]))\n", + "print(\"Sample trailing token mask:\\n{}\\n\".format(train_trailing_token_mask[0]))\n", + "print(\"Sample label ids:\\n{}\\n\".format(train_label_ids[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Token Classifier\n", + "The value of the `language` argument determines which BERT model is used:\n", + "* Language.ENGLISH: \"bert-base-uncased\"\n", + "* Language.ENGLISHCASED: \"bert-base-cased\"\n", + "* Language.ENGLISHLARGE: \"bert-large-uncased\"\n", + "* Language.ENGLISHLARGECASED: \"bert-large-cased\"\n", + "* Language.CHINESE: \"bert-base-chinese\"\n", + "* Language.MULTILINGUAL: \"bert-base-multilingual-cased\"\n", + "\n", + "Here we use the base model pre-trained only on Chinese data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "token_classifier = BERTTokenClassifier(language=LANGUAGE,\n", + " num_labels=len(label_map),\n", + " cache_dir=CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "t_total value of -1 results in schedule not being applied\n", + "Epoch: 0%| | 0/1 [00:00\",\n", + " resource_group=\"\",\n", + " workspace_name=\"\",\n", + " workspace_region=\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Register BiDAF model for Deployment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This step downloads the pre-trained [AllenNLP](https://allennlp.org/models) pretrained model and registers the model in our Workspace. The pre-trained AllenNLP model we use is called Bidirectional Attention Flow for Machine Comprehension ([BiDAF](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02\n", + ")) It achieved state-of-the-art performance on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset in 2017 and is a well-respected, performant baseline for QA. AllenNLP's pre-trained BIDAF model is trained on the SQuAD training set and achieves an EM score of 68.3 on the SQuAD development set. 
See the [BiDAF deep dive notebook]
Rather, just retrieve the pre-existing model in your Workspace with `bidaf_model = Model(ws, name='bidaf')`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registering model bidaf\n" + ] + } + ], + "source": [ + "bidaf_model = Model.register(workspace = ws,\n", + " model_path =\"bidaf.tar.gz\",\n", + " model_name = \"bidaf\",\n", + " tags = MODEL_TAGS,\n", + " description = \"BiDAF Pretrained Model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Create Scoring Script" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we show an example of an entry script, score.py, which is called from the deployed webservice. The script must contain:\n", + "\n", + "1. init() - This function loads the model in a global object. \n", + "2. run() - This function is used for model prediction. The inputs and outputs to run() typically use JSON for serialization and deserilization. \n", + "\n", + "Our scoring script allows for both real-time and batch prediction. Each observation is a dictionary with two keys: _question_ and _passage_. With batch prediction we pass in a list of observations and use AllenNLPs `predict_batch_json()` method. For real-time prediction we pass in a single observation and use AllenNLPs `predict()` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing score.py\n" + ] + } + ], + "source": [ + "%%writefile score.py\n", + "import json\n", + "from allennlp.predictors import Predictor\n", + "from azureml.core.model import Model\n", + "\n", + "def init():\n", + " global model\n", + " bidaf_dir_path = Model.get_model_path('bidaf')\n", + " model = Predictor.from_path(bidaf_dir_path)\n", + "\n", + "def run(rawdata):\n", + " try:\n", + " data = json.loads(rawdata)\n", + " \n", + " # if one question-passage pair was passed\n", + " if type(data) == dict:\n", + " passage = data['passage']\n", + " question = data['question']\n", + " result = model.predict(question, passage)[\"best_span_str\"]\n", + " \n", + " # if multiple question-passage pairs were passed\n", + " elif type(data) == list:\n", + " result = model.predict_batch_json(data)\n", + " result = [i[\"best_span_str\"] for i in result]\n", + "\n", + " except Exception as e:\n", + " result = str(e)\n", + " return json.dumps({\"error\": result})\n", + " return json.dumps({\"result\":result})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.4 Create a YAML File for the Environment " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, bidafenv.yml, which specifies the dependencies from the run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bidafenv.yml'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "myenv = CondaDependencies.create(conda_packages= DEPLOYMENT_CONDA_PACKAGES,\n", + " pip_packages= DEPLOYMENT_PIP_PACKAGES, \n", + " python_version = DEPLOYMENT_PYTHON_VERSION)\n", + "myenv.add_channel('conda-forge')\n", + "myenv.add_channel('pytorch')\n", + "\n", + "conda_env_file_name = 'bidafenv.yml'\n", + "myenv.save_to_file('.', conda_env_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.5 Image Creation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this step we create a container image which is wrapper containing the entry script, yaml file with package dependencies and the model. The created image is then deployed as a webservice in the next step. This step can take up to 10 minutes and even longer if the model is large." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating image\n", + "Running..............................................................................................................................................................\n", + "SucceededImage creation operation finished for image bidaf-image:34, operation \"Succeeded\"\n" + ] + } + ], + "source": [ + "image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n", + " runtime = \"python\",\n", + " conda_file = conda_env_file_name,\n", + " description = \"Image with BiDAF model\",\n", + " tags = CONTAINER_TAGS)\n", + "\n", + "image = ContainerImage.create(name = \"bidaf-image\",\n", + " models = [bidaf_model],\n", + " image_config = image_config,\n", + " workspace = ws)\n", + "\n", + "image.wait_for_creation(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the above step fails, then use the below command to see logs" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# print(image.image_build_log_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.6 Deploy the Image as a Web Service to Azure Container Instance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure Container Instances are mostly used for deploying your models as a web service if one or more of the following conditions are true: \n", + "1. You need to quickly deploy and validate your model.\n", + "2. You are testing a model that is under development. \n", + "\n", + "\n", + "To set them up properly, we need to indicate the number of CPU cores and the amount of memory we want to allocate to our web service." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#Set the web service configuration\n", + "aci_config = AciWebservice.deploy_configuration(cpu_cores = CPU_CORES, \n", + " memory_gb = MEMORY_GB)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final step to deploying our web service is to call WebService.deploy_from_image(). This function uses the Docker image and the deployment configuration we created above to perform the following: \n", + "1. Deploy the docker image to an Azure Container Instance\n", + "2. Call the init() function in our scoring file\n", + "3. Provide an HTTP endpoint for scoring calls \n", + "\n", + "The deploy_from_image method requires the following parameters:\n", + "1. workspace: the workspace containing the service\n", + "2. name: a unique name used to identify the service in the workspace\n", + "3. image: a docker image object that contains the environment needed for scoring/inference\n", + "4. deployment_config: a configuration object describing the compute type\n", + "\n", + "**Note**: The web service creation can take a few minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating service\n", + "Running....................................................\n", + "SucceededACI service creation operation finished, operation \"Succeeded\"\n", + "Healthy\n" + ] + } + ], + "source": [ + "# deploy image as web service\n", + "aci_service = Webservice.deploy_from_image(workspace = ws, \n", + " name = 'bidaf-aci-service',\n", + " image = image,\n", + " deployment_config = aci_config)\n", + "\n", + "aci_service.wait_for_deployment(show_output = True)\n", + "print(aci_service.state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fetch logs to debug in case of failures." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print(aci_service.get_logs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to reuse an existing service versus creating a new one, call the webservice with the name of the service. You can look up all the deployed webservices under deployment in the Azure Portal. Below is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# aci_service = Webservice(workspace=ws, name='<>')\n", + "\n", + "# to use the webservice\n", + "# aci_service.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Conclusion**: Now we have a deployed webservice and deploying the model took less than 20 minutes!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Test Deployed Webservice" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Depending on the needs of our QA system, we can either do real-time or batch scoring. We show an example of both types of scoring below using the following example [passage](https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/007ab5528b3bd310a80d553cccad4b78dc496b02) and questions:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "passage = \"Machine Comprehension (MC), answering questions about a given context, \\\n", + "requires modeling complex interactions between the context and the query. Recently,\\\n", + "attention mechanisms have been successfully extended to MC. Typically these mechanisms\\\n", + "use attention to summarize the query and context into a single vector, couple \\\n", + "attentions temporally, and often form a uni-directional attention. 
In this paper \\\n", + "we introduce the Bi-Directional Attention Flow (BIDAF) network, a multi-stage \\\n", + "hierarchical process that represents the context at different levels of granularity \\\n", + "and uses a bi-directional attention flow mechanism to achieve a query-aware context \\\n", + "representation without early summarization. Our experimental evaluations show that \\\n", + "our model achieves the state-of-the-art results in Stanford QA (SQuAD) and\\\n", + "CNN/DailyMail Cloze Test datasets.\"\n", + "\n", + "question1 = \"What is BIDAF?\"\n", + "question2 = \"What datasets does BIDAF achieve state-of-the-art results on?\"\n", + "question3 = \"What do attention mechanisms do?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Real-time Scoring" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We prepare data for predicting answers for one passage-question pair by creating a dictionary with _question_ and _passage_ keys" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\"passage\": passage, \"question\":question1}\n", + "data = json.dumps(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time elapsed: 0.8884\n", + "Answer: Bi-Directional Attention Flow\n" + ] + } + ], + "source": [ + "with Timer() as t:\n", + " score = aci_service.run(input_data=data)\n", + " t.stop()\n", + " print(\"Time elapsed: {}\".format(t))\n", + " \n", + "result = json.loads(score)\n", + "try:\n", + " output = result[\"result\"]\n", + " print(\"Answer:\", output)\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the model responded to the question \"What is BiDAF?\" with \"Bi-Directional Attention Flow\"." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Batch Scoring\n", + "\n", + "We prepare the data for batch scoring by creating a list of dictionaries with _passage_ and _question_ keys." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "data_multiple = [{\"passage\": passage, \"question\":i} for i in [question1, question2, question3]]\n", + "data_multiple = json.dumps(data_multiple)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time elapsed: 0.9046\n", + "['Bi-Directional Attention Flow', 'Stanford QA (SQuAD) andCNN/DailyMail Cloze Test', 'have been successfully extended to MC']\n" + ] + } + ], + "source": [ + "with Timer() as t:\n", + " score = aci_service.run(input_data=data_multiple)\n", + " t.stop()\n", + " print(\"Time elapsed: {}\".format(t))\n", + " \n", + "result = json.loads(score)\n", + "try:\n", + " output = result[\"result\"]\n", + " print(output)\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the model responded to the question \"What is BiDAF?\" with \"Bi-Directional Attention Flow\", the question \"What datasets does BIDAF achieve state-of-the-art results on?\" with \"Stanford QA (SQuAD) and CNN/DailyMail Cloze Test\", and the question \"What do attention mechanisms do?\" with \"summarize the query and context into a single vector, couple attentions temporally, and often form a uni-directional attention\". All these answers make sense given the passage and demonstrate that the AllenNLP pre-trained model is a good model for a deployed QA system. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrated how to produce a fast QA service in under 20 minutes using Azure Container Instances (ACI). We deployed a popular pre-trained model, BiDAF, provided by AllenNLP, which was state-of-the-art in 2017 and performs well on our example queries. " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scenarios/sentence_similarity/README.md b/scenarios/sentence_similarity/README.md index f22c24d23..d2f5f3af5 100644 --- a/scenarios/sentence_similarity/README.md +++ b/scenarios/sentence_similarity/README.md @@ -1,6 +1,27 @@ +# Sentence Similarity -## What is sentence similarity? +This folder contains examples and best practices, written in Jupyter notebooks, for building sentence similarity models. The scores can be used in a wide variety of applications, such as search/retrieval, nearest-neighbor or kernel-based classification methods, recommendations, and ranking tasks. -Sentence similarity or semantic textual similarity is to determine how similar two pieces of texts are and a measure of the degree to which two pieces of text express the same meaning. This can take the form of assigning a score from 1 to 5. Related tasks are parahrase or duplicate identification. Sentence similarity is normally calculated by the following two steps: 1. obtaining the embeddings of the sentences, 2. 
taking the cosine similarity between them as shown in the following figure: +## What is sentence similarity -![Sentence Similarity](https://nlpbp.blob.core.windows.net/images/example-similarity.png)**Sentence Similarity ([Source](https://tfhub.dev/google/universal-sentence-encoder/1))** \ No newline at end of file +Sentence similarity or semantic textual similarity is a measure of how similar two pieces of text are, or to what degree they express the same meaning. Related tasks include paraphrase or duplicate identification, search, and matching applications. The common methods used for text similarity range from simple word-vector dot products to pairwise classification, and more recently, deep neural networks. + +Sentence similarity is normally calculated by the following two steps: + +1. obtaining the embeddings of the sentences + +2. taking the cosine similarity between them as shown in the following figure([source](https://tfhub.dev/google/universal-sentence-encoder/1)): + + ![Sentence Similarity](https://nlpbp.blob.core.windows.net/images/example-similarity.png) + +## Summary + +The following summarizes each notebook for Sentence Similarity. Each notebook provides more details and guiding in principles on building state of the art models. + +|Notebook|Runs Local|Description| +|---|---|---| +|[Creating a Baseline model](baseline_deep_dive.ipynb)| Yes| A baseline model is a basic solution that serves as a point of reference for comparing other models to. The baseline model's performance gives us an indication of how much better our models can perform relative to a naive approach.| +|Senteval |[local](senteval_local.ipynb), [AzureML](senteval_azureml.ipynb)|SentEval is a widely used benchmarking tool for evaluating general-purpose sentence embeddings. Running SentEval locally is easy, but not necessarily efficient depending on the model specs. We provide an example on how to do this efficiently in Azure Machine Learning Service. 
| +|[GenSen on AzureML](gensen_aml_deep_dive.ipynb)| No | This notebook serves as an introduction to an end-to-end NLP solution for sentence similarity building one of the State of the Art models, GenSen, on the AzureML platform. We show the advantages of AzureML when training large NLP models with GPU. +|[Automated Machine Learning(AutoML) with Deployment on Azure Container Instance](automl_local_deployment_ACI.ipynb)| Yes |This notebook shows users how to use AutoML on local machine and deploy the model as a webservice to Azure Container Instance(ACI) to get a sentence similarity score. +|[Google Universal Sentence Encoder with Azure Machine Learning Pipeline, AutoML with Deployment on Azure Kubernetes Service](aml_pipelines_deployment_AKS.ipynb)| No | This notebook shows a user how to use AzureML pipelines and deploy the pipeline output model as a webservice to Azure Kubernetes Service which can be used as an end point to get sentence similarity scores. diff --git a/scenarios/sentence_similarity/automl_local_deployment_aci.ipynb b/scenarios/sentence_similarity/automl_local_deployment_aci.ipynb new file mode 100644 index 000000000..ff76830cb --- /dev/null +++ b/scenarios/sentence_similarity/automl_local_deployment_aci.ipynb @@ -0,0 +1,986 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Local Automated Machine Learning Model with ACI Deployment for Predicting Sentence Similarity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to use [Azure Machine Learning Service's](https://azure.microsoft.com/en-us/services/machine-learning-service/\n", + ") Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\n", + ")) locally to automate machine learning model selection and tuning and how to use Azure Container Instance ([ACI](https://azure.microsoft.com/en-us/services/container-instances/\n", + ")) for deployment. We utilize the STS Benchmark dataset to predict sentence similarity and utilize AutoML's text preprocessing features." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "1. [Introduction](#1.-Introduction) \n", + " * 1.1 [What is Azure AutoML?](#1.1-What-is-Azure-AutoML?) \n", + " * 1.2 [Modeling Problem](#1.2-Modeling-Problem) \n", + " \n", + " \n", + "2. [Data Preparation](#2.-Data-Preparation) \n", + "\n", + "\n", + "3. [Create AutoML Run](#3.-Create-AutoML-Run) \n", + " * 3.1 [Link to or create a Workspace](#3.1-Link-to-or-create-a-Workspace) \n", + " * 3.2 [Create AutoMLConfig object](#3.2-Create-AutoMLConfig-object)\n", + " * 3.3 [Run Experiment](#3.3-Run-Experiment)\n", + " \n", + " \n", + "4. 
[Deploy Sentence Similarity Model](#4.-Deploy-Sentence-Similarity-Model) \n", + " 4.1 [Retrieve the Best Model](#4.1-Retrieve-the-Best-Model) \n", + " 4.2 [Register the Fitted Model for Deployment](#4.2-Register-the-Fitted-Model-for-Deployment) \n", + " 4.3 [Create an Entry Script](#4.3-Create-an-Entry-Script) \n", + " 4.4 [Create a YAML File for the Environment](#4.4-Create-a-YAML-File-for-the-Environment) \n", + " 4.5 [Create a Container Image](#4.5-Create-a-Container-Image) \n", + " 4.6 [Deploy the Image as a Web Service to Azure Container Instance](#4.6-Deploy-the-Image-as-a-Web-Service-to-Azure-Container-Instance) \n", + " 4.7 [Test Deployed Model](#4.7-Test-Deployed-Model) \n", + " \n", + " \n", + "5. [Clean](#5-Clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 What is Azure AutoML?\n", + "\n", + "Automated machine learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml)) is a capability of Microsoft's [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/\n", + "). The goal of AutoML is to improve the productivity of data scientists and democratize AI by allowing for the rapid development and deployment of machine learning models. To achieve this goal, AutoML automates the process of selecting a ML model and tuning the model. All the user is required to provide is a dataset (suitable for a classification, regression, or time-series forecasting problem) and a metric to optimize in choosing the model and hyperparameters. The user is also given the ability to set time and cost constraints for the model selection and tuning."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://nlpbp.blob.core.windows.net/images/automl.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The AutoML model selection and tuning process can be easily tracked through the Azure portal or directly in python notebooks through the use of widgets. AutoML quickly selects a high quality machine learning model tailored for your prediction problem. In this notebook, we walk through the steps of preparing data, setting up an AutoML experiment, and evaluating the results of our best model. More information about running AutoML experiments in Python can be found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Modeling Problem\n", + "\n", + "The regression problem we will demonstrate is predicting sentence similarity scores on the STS Benchmark dataset. The [STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) contains a selection of English datasets that were used in Semantic Textual Similarity (STS) tasks 2012-2017. The dataset contains 8,628 sentence pairs with a human-labeled integer representing the sentences' similarity (ranging from 0, for no meaning overlap, to 5, meaning equivalence). The sentence pairs will be embedded using AutoML's built-in preprocessing, so we'll pass the sentences directly into the model."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Set the environment path to find NLP\n", + "import sys\n", + "\n", + "sys.path.append(\"../../\")\n", + "import time\n", + "import os\n", + "import pandas as pd\n", + "import shutil\n", + "import numpy as np\n", + "import torch\n", + "import sys\n", + "from scipy.stats import pearsonr\n", + "from scipy.spatial import distance\n", + "from sklearn.externals import joblib\n", + "import json\n", + "\n", + "# Import utils\n", + "from utils_nlp.azureml import azureml_utils\n", + "from utils_nlp.dataset import stsbenchmark\n", + "from utils_nlp.dataset.preprocess import (\n", + " to_lowercase,\n", + " to_spacy_tokens,\n", + " rm_spacy_stopwords,\n", + ")\n", + "from utils_nlp.common.timer import Timer\n", + "\n", + "# Tensorflow dependencies for Google Universal Sentence Encoder\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "\n", + "tf.logging.set_verbosity(tf.logging.ERROR) # reduce logging output\n", + "\n", + "# AzureML packages\n", + "import azureml as aml\n", + "import logging\n", + "from azureml.telemetry import set_diagnostics_collection\n", + "\n", + "set_diagnostics_collection(send_diagnostics=True)\n", + "from azureml.train.automl import AutoMLConfig\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.widgets import RunDetails\n", + "from azureml.train.automl.run import AutoMLRun\n", + "from azureml.core.webservice import AciWebservice, Webservice\n", + "from azureml.core.image import ContainerImage\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Azure ML SDK Version:\", aml.core.VERSION)\n", + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Tensorflow Version:\", tf.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "BASE_DATA_PATH = \"../../data\"\n", + "CPU_CORES = 1\n", + "MEMORY_GB = 8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the settings for AutoML\n", + "automl_settings = {\n", + " \"task\": \"regression\", # type of task: classification, regression or forecasting\n", + " \"debug_log\": \"automated_ml_errors.log\",\n", + " \"path\": \"./automated-ml-regression\",\n", + " \"iteration_timeout_minutes\": 15, # How long each iteration can take before moving on\n", + " \"iterations\": 50, # Number of algorithm options to try\n", + " \"primary_metric\": \"spearman_correlation\", # Metric to optimize\n", + " \"preprocess\": True, # Whether dataset preprocessing should be applied\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Data Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## STS Benchmark Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As described above, the STS Benchmark dataset contains 8.6K sentence pairs along with a human-annotated score for how similiar the two sentences are. We will load the training, development (validation), and test sets provided by STS Benchmark and preprocess the data (lowercase the text, drop irrelevant columns, and rename the remaining columns) using the utils contained in this repo. Each dataset will ultimately have three columns: _sentence1_ and _sentence2_ which contain the text of the sentences in the sentence pair, and _score_ which contains the human-annotated similarity score of the sentence pair." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load in the raw datasets as pandas dataframes\n", + "train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", + "dev_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n", + "test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean each dataset by lowercasing text, removing irrelevant columns,\n", + "# and renaming the remaining columns\n", + "train_clean = stsbenchmark.clean_sts(train_raw)\n", + "dev_clean = stsbenchmark.clean_sts(dev_raw)\n", + "test_clean = stsbenchmark.clean_sts(test_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert all text to lowercase\n", + "train = to_lowercase(train_clean)\n", + "dev = to_lowercase(dev_clean)\n", + "test = to_lowercase(test_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Training set has {} sentences\".format(len(train)))\n", + "print(\"Development set has {} sentences\".format(len(dev)))\n", + "print(\"Testing set has {} sentences\".format(len(test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Create AutoML Run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "AutoML can be used for classification, regression or timeseries experiments. 
Each experiment type has corresponding machine learning models and metrics that can be optimized (see [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train)) and the options will be delineated below. As a first step we connect to an existing workspace or create one if it doesn't exist." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.1 Link to or create a Workspace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n", + "\n", + "**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ws = azureml_utils.get_or_create_workspace(\n", + " subscription_id=\"\",\n", + " resource_group=\"\",\n", + " workspace_name=\"\",\n", + " workspace_region=\"\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\n", + " \"Workspace name: \" + ws.name,\n", + " \"Azure region: \" + ws.location,\n", + " \"Subscription id: \" + ws.subscription_id,\n", + " \"Resource group: \" + ws.resource_group,\n", + " sep=\"\\n\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.2 Create AutoMLConfig object\n", + "Next, we specify the parameters for the AutoMLConfig class. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**task** \n", + "AutoML supports the following base learners for the regression task: Elastic Net, Light GBM, Gradient Boosting, Decision Tree, K-nearest Neighbors, LARS Lasso, Stochastic Gradient Descent, Random Forest, Extremely Randomized Trees, XGBoost, DNN Regressor, Linear Regression. In addition, AutoML also supports two kinds of ensemble methods: voting (weighted average of the output of multiple base learners) and stacking (training a second \"metalearner\" which uses the base algorithms' predictions to predict the target variable). Specific base learners can be included or excluded in the parameters for the AutoMLConfig class (whitelist_models and blacklist_models) and the voting/stacking ensemble options can be specified as well (enable_voting_ensemble and enable_stack_ensemble)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**preprocess** \n", + "AutoML also has advanced preprocessing methods, eliminating the need for users to perform this manually. Data is automatically scaled and normalized but an additional parameter in the AutoMLConfig class enables the use of more advanced techniques including imputation, generating additional features, transformations, word embeddings, etc. (full list found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-create-portal-experiments#preprocess)). Note that algorithm-specific preprocessing will be applied even if preprocess=False. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**primary_metric** \n", + "The regression metrics available are the following: Spearman Correlation (spearman_correlation), Normalized RMSE (normalized_root_mean_squared_error), Normalized MAE (normalized_mean_absolute_error), and R2 score (r2_score) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Constraints:** \n", + "There is a cost_mode parameter to set cost prediction modes (see options [here](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlconfig?view=azure-ml-py)). To set constraints on time there are multiple parameters including experiment_exit_score (target score to exit the experiment after achieving), experiment_timeout_minutes (maximum amount of time for all combined iterations), and iterations (total number of different algorithm and parameter combinations to try)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: we are directly passing in sentence pairs as data because we are relying upon AutoML's built-in preprocessing (by setting preprocess = True in the AutoMLConfig parameters) to perform the embedding step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = train.drop(\"score\", axis=1).values\n", + "y_train = train[\"score\"].values.flatten()\n", + "X_validation = dev.drop(\"score\", axis=1).values\n", + "y_validation = dev[\"score\"].values.flatten()\n", + "\n", + "# local compute\n", + "automated_ml_config = AutoMLConfig(\n", + " X=X_train,\n", + " y=y_train,\n", + " X_valid=X_validation,\n", + " y_valid=y_validation,\n", + " verbosity=logging.ERROR,\n", + " **automl_settings # where the autoML main settings are defined\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.3 Run Experiment\n", + "\n", + "Run the experiment locally and inspect the results using a widget" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "experiment = Experiment(ws, \"automated-ml-regression\")\n", + "local_run = experiment.submit(automated_ml_config, show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results of the completed run can be visualized in two ways. First, by using a RunDetails widget as shown in the cell below. Second, by accessing the [Azure portal](https://portal.azure.com), selecting your workspace, clicking on _Experiments_ and then selecting the name and run number of the experiment you want to inspect. Both these methods will show the results and duration for each iteration (algorithm tried), a visualization of the results, and information about the run including the compute target, primary metric, etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the run details using the provided widget\n", + "RunDetails(local_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://nlpbp.blob.core.windows.net/images/autoMLwidget.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Deploy Sentence Similarity Model\n", + "Deploying an Azure Machine Learning model as a web service creates a REST API. You can send data to this API and receive the prediction returned by the model.\n", + "In general, you create a webservice by deploying a model as an image to a Compute Target.\n", + "\n", + "Some of the Compute Targets are: \n", + "1. Azure Container Instance\n", + "2. Azure Kubernetes Service\n", + "3. Local web service\n", + "\n", + "The general workflow for deploying a model is as follows:\n", + "1. Register a model\n", + "2. Prepare to deploy\n", + "3. Deploy the model to the compute target\n", + "4. Test the deployed model (webservice)\n", + "\n", + "In this notebook, we walk you through the process of creating a webservice running on Azure Container Instance by deploying an AutoML model as an image. ACI is typically used for low scale, CPU-based workloads. (You can find more information on deploying and serving models [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where))\n", + "\n", + "## 4.1 Retrieve the Best Model\n", + "Now we can identify the model that maximized performance on a given metric (spearman correlation in our case) using the `get_output` method which returns the best_run (AutoMLRun object with information about the experiment) and fitted_model ([Pipeline]((https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb)) object) across all iterations. 
Overloads on `get_output` allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration. \n", + "\n", + "The different steps that make up the pipeline can be accessed through `fitted_model.named_steps` and information about data preprocessing is available through `fitted_model.named_steps['datatransformer'].get_featurization_summary()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_run, fitted_model = local_run.get_output()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.2 Register the Fitted Model for Deployment\n", + "\n", + "Registering a model means registering one or more files that make up a model. The Machine Learning models are registered in your current Azure Machine Learning Workspace. The model can either come from Azure Machine Learning or another location, such as your local machine. \n", + "Below we show how a model is registered from the results of an experiment run. If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered.\n", + "\n", + "See other ways to register a model [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "description = \"AutoML Model\"\n", + "tags = {\"area\": \"nlp\", \"type\": \"sentence similarity automl\"}\n", + "name = \"automl\"\n", + "model = local_run.register_model(description=description, tags=tags)\n", + "\n", + "print(local_run.model_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.3 Create an Entry Script\n", + "In this section we show an example of an entry script, which is called from the deployed webservice. `score.py` is our entry script. The script must contain:\n", + "1. 
init() - This function loads the model in a global object.\n", + "2. run() - This function is used for model prediction. The inputs and outputs to `run()` typically use JSON for serialization and deserialization. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile score.py\n", + "import pickle\n", + "import json\n", + "import numpy\n", + "import azureml.train.automl\n", + "from sklearn.externals import joblib\n", + "from azureml.core.model import Model\n", + "\n", + "\n", + "def init():\n", + " global model\n", + " model_path = Model.get_model_path(\n", + " model_name=\"<>\"\n", + " ) # this name is model.id of model that we want to deploy\n", + " # deserialize the model file back into a sklearn model\n", + " model = joblib.load(model_path)\n", + "\n", + "\n", + "def run(rawdata):\n", + " try:\n", + " data = json.loads(rawdata)[\"data\"]\n", + " data = numpy.array(data)\n", + " result = model.predict(data)\n", + " except Exception as e:\n", + " result = str(e)\n", + " return json.dumps({\"error\": result})\n", + " return json.dumps({\"result\": result.tolist()})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Substitute the actual model id in the script file.\n", + "script_file_name = \"score.py\"\n", + "\n", + "with open(script_file_name, \"r\") as cefr:\n", + " content = cefr.read()\n", + "\n", + "with open(script_file_name, \"w\") as cefw:\n", + " cefw.write(content.replace(\"<>\", local_run.model_id))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.4 Create a YAML File for the Environment\n", + "\n", + "To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, automlenv.yml, which specifies the dependencies from the run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "experiment = Experiment(ws, \"automated-ml-regression\")\n", + "ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_iteration = int(\n", + " best_run.id.split(\"_\")[-1]\n", + ") # get the appended iteration number for the best model\n", + "dependencies = ml_run.get_run_sdk_dependencies(iteration=best_iteration)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add dependencies in the yaml file from the above cell. You must specify the version of \"azureml-sdk[automl]\" while creating the yaml file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "myenv = CondaDependencies.create(\n", + " conda_packages=[\"numpy\", \"scikit-learn\", \"py-xgboost<=0.80\"],\n", + " pip_packages=[\"azureml-sdk[automl]==1.0.43.*\"],\n", + " python_version=\"3.6.8\",\n", + ")\n", + "\n", + "conda_env_file_name = \"automlenv.yml\"\n", + "myenv.save_to_file(\".\", conda_env_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.5 Create a Container Image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this step we create a container image which is wrapper containing the entry script, yaml file with package dependencies and the model. The created image is then deployed as a webservice in the next step. This step can take up to 10 minutes and even longer if the model is large." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_config = ContainerImage.image_configuration(\n", + " execution_script=script_file_name,\n", + " runtime=\"python\",\n", + " conda_file=conda_env_file_name,\n", + " description=\"Image with automl model\",\n", + " tags={\"area\": \"nlp\", \"type\": \"sentencesimilarity automl\"},\n", + ")\n", + "\n", + "image = ContainerImage.create(\n", + " name=\"automl-image\",\n", + " # this is the model object\n", + " models=[model],\n", + " image_config=image_config,\n", + " workspace=ws,\n", + ")\n", + "\n", + "image.wait_for_creation(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the above step fails, then use the below command to see logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print(image.image_build_log_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.6 Deploy the Image as a Web Service to Azure Container Instance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure Container Instances are mostly used for deploying your models as a web service if one or more of the following conditions are true:\n", + "1. You need to quickly deploy and validate your model.\n", + "2. You are testing a model that is under development.\n", + "\n", + "To set them up properly, we need to indicate the number of CPU cores and the amount of memory we want to allocate to our web service." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set the web service configuration\n", + "aci_config = AciWebservice.deploy_configuration(\n", + " cpu_cores=CPU_CORES, memory_gb=MEMORY_GB\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final step to deploying our web service is to call `WebService.deploy_from_image()`. This function uses the Docker image and the deployment configuration we created above to perform the following:\n", + "1. Deploy the docker image to an Azure Container Instance\n", + "2. Call the init() function in our scoring file\n", + "3. Provide an HTTP endpoint for scoring calls\n", + "\n", + "The deploy_from_image method requires the following parameters:\n", + "\n", + "1. workspace: the workspace containing the service\n", + "2. name: a unique name used to identify the service in the workspace\n", + "3. image: a docker image object that contains the environment needed for scoring/inference\n", + "4. deployment_config: a configuration object describing the compute type\n", + "\n", + "**Note:** The web service creation can take a few minutes " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# deploy image as web service\n", + "aci_service = Webservice.deploy_from_image(\n", + " workspace=ws, name=\"aci-automl-service-1\", image=image, deployment_config=aci_config\n", + ")\n", + "\n", + "aci_service.wait_for_deployment(show_output=True)\n", + "print(aci_service.state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fetch logs to debug in case of failures." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print(aci_service.get_logs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to reuse an existing service versus creating a new one, call the webservice with the name. You can look up all the deployed webservices under deployment in the Azure Portal. Below is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# aci_service = Webservice(workspace=ws, name='<>')\n", + "\n", + "# to use the webservice\n", + "# aci_service.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.7 Test Deployed Model\n", + "\n", + "Testing the deployed model means running the created webservice.
\n", + "The deployed model can be tested by passing a list of sentence pairs. The output will be a score between 0 and 5, with 0 indicating no meaning overlap between the sentences and 5 meaning equivalence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " [\"This is sentence1\", \"This is sentence1\"],\n", + " [\"A hungry cat.\", \"A sleeping cat\"],\n", + " [\"Its summer time \", \"Winter is coming\"],\n", + "]\n", + "data = {\"data\": sentences}\n", + "data = json.dumps(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set up a Timer to see how long the model takes to predict\n", + "t = Timer()\n", + "\n", + "t.start()\n", + "score = aci_service.run(input_data=data)\n", + "t.stop()\n", + "\n", + "print(\"Time elapsed: {}\".format(t))\n", + "\n", + "result = json.loads(score)\n", + "try:\n", + " output = result[\"result\"]\n", + " print(\"Number of samples predicted: {}\".format(len(output)))\n", + " print(output)\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we'll calculate the Pearson Correlation on the test set.\n", + "\n", + "**What is Pearson Correlation?**\n", + "\n", + "Our evaluation metric is Pearson correlation ($\\rho$) which is a measure of the linear correlation between two variables. The formula for calculating Pearson correlation is as follows: \n", + "\n", + "$$\\rho_{X,Y} = \\frac{E[(X-\\mu_X)(Y-\\mu_Y)]}{\\sigma_X \\sigma_Y}$$\n", + "\n", + "This metric takes a value in [-1,1] where -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 represents no correlation. 
We utilize the Pearson correlation metric as this is the main metric that [SentEval](http://nlpprogress.com/english/semantic_textual_similarity.html), a widely-used evaluation toolkit for evaluating sentence representations, uses for the STS Benchmark dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_y = test[\"score\"].values.flatten()\n", + "test_x = test.drop(\"score\", axis=1).values.tolist()\n", + "\n", + "data = {\"data\": test_x}\n", + "data = json.dumps(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set up a Timer to see how long the model takes to predict\n", + "t = Timer()\n", + "\n", + "t.start()\n", + "score = aci_service.run(input_data=data)\n", + "t.stop()\n", + "\n", + "print(\"Time elapsed: {}\".format(t))\n", + "\n", + "result = json.loads(score)\n", + "try:\n", + " output = result[\"result\"]\n", + " print(\"Number of samples predicted: {}\".format(len(output)))\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(pearsonr(output, test_y)[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Clean up\n", + "Throughout the notebook, we used a workspace and Azure container instances. To get a sense of the cost we incurred, we can refer to this [calculator](https://azure.microsoft.com/en-us/pricing/calculator/). We can also navigate to the [Cost Management + Billing](https://ms.portal.azure.com/#blade/Microsoft_Azure_Billing/ModernBillingMenuBlade/Overview) pane on the portal, click on our subscription ID, and click on the Cost Analysis tab to check our credit usage.\n", + "

\n", + "In order not to incur extra costs, let's delete the resources we no longer need.\n", + "

\n", + "Once we have verified that our web service works well on ACI, we can delete it. This helps reduce [costs](https://azure.microsoft.com/en-us/pricing/details/container-instances/), since the container group we were paying for no longer exists, and allows us to keep our workspace clean." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# aci_service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, the main resource we are paying for is the Standard Azure Container Registry (ACR), which contains our Docker image. Details on pricing are available [here](https://azure.microsoft.com/en-us/pricing/details/container-registry/).\n", + "\n", + "We may decide to use our Docker image in a separate ACI or even in an AKS deployment. In that case, we should keep it available in our workspace. However, if we no longer have a use for it, we can delete it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# docker_image.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If our goal is to continue using our workspace, we should keep it available. On the contrary, if we plan on no longer using it and its associated resources, we can delete it.\n", + "

\n", + "Note: Deleting the workspace will delete all the experiments, outputs, models, Docker images, deployments, etc. that we created in that workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ws.delete(delete_dependent_resources=True)\n", + "# This deletes our workspace, the container registry, the account storage, Application Insights and the key vault" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned above, Azure Container Instances tend to be used to develop and test deployments. They are typically configured with CPUs, which usually suffice when the number of requests per second is not too high. When working with several instances, we can configure them further by specifically allocating CPU resources to each of them.\n", + "\n", + "\n", + "For production requirements, i.e. when > 100 requests per second are expected, we recommend deploying models to Azure Kubernetes Service (AKS). It is a convenient infrastructure as it manages hosted Kubernetes environments, and makes it easy to deploy and manage containerized applications without container orchestration expertise. It also supports deployments with CPU clusters and deployments with GPU clusters.\n", + "\n", + "To see an example with Azure Kubernetes Service, go to [this notebook](https://github.com/microsoft/nlp/blob/courtney-janhavi-automl/scenarios/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb)\n", + "\n", + "For more examples on deployment follow [MachineLearningNotebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/deployment) github repository." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scenarios/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb b/scenarios/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb new file mode 100644 index 000000000..00673a288 --- /dev/null +++ b/scenarios/sentence_similarity/automl_with_pipelines_deployment_aks.ipynb @@ -0,0 +1,1602 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AzureML Pipeline, AutoML, AKS Deployment for Sentence Similarity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to use [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-service/\n", + ") pipelines and Automated Machine Learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml\n", + ")) to streamline the creation of a machine learning workflow for predicting sentence similarity. The pipeline contains two steps: \n", + "1. PythonScriptStep: embeds sentences using a popular sentence embedding model, Google Universal Sentence Encoder\n", + "2. 
AutoMLStep: demonstrates how to use Automated Machine Learning (AutoML) to automate model selection for predicting sentence similarity (regression)\n", + "\n", + "After creating the pipeline, the notebook demonstrates the deployment of our sentence similarity model using Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes\n", + ")).\n", + "\n", + "This notebook showcases how to use the following AzureML features: \n", + "- AzureML Pipelines (PythonScriptStep and AutoMLStep)\n", + "- Automated Machine Learning\n", + "- AmlCompute\n", + "- Datastore\n", + "- Logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "1. [Introduction](#1.-Introduction) \n", + " * 1.1 [What are AzureML Pipelines?](#1.1-What-are-AzureML-Pipelines?) \n", + " * 1.2 [What is Azure AutoML?](#1.2-What-is-Azure-AutoML?) \n", + " * 1.3 [Modeling Problem](#1.3-Modeling-Problem) \n", + "2. [Data Preparation](#2.-Data-Preparation) \n", + "3. [AzureML Setup](#3.-AzureML-Setup) \n", + " * 3.1 [Link to or create a `Workspace`](#3.1-Link-to-or-create-a-Workspace) \n", + " * 3.2 [Set up an `Experiment` and Logging](#3.2-Set-up-an-Experiment-and-Logging) \n", + " * 3.3 [Link `AmlCompute` compute target](#3.3-Link-AmlCompute-compute-target) \n", + " * 3.4 [Upload data to `Datastore`](#3.4-Upload-data-to-Datastore) \n", + "4. 
[Create AzureML Pipeline](#4.-Create-AzureML-Pipeline) \n", + " * 4.1 [Set up run configuration file](#4.1-Set-up-run-configuration-file) \n", + " * 4.2 [PythonScriptStep](#4.2-PythonScriptStep) \n", + " * 4.2.1 [Define python script to run](#4.2.1-Define-python-script-to-run)\n", + " * 4.2.2 [Create PipelineData object](#4.2.2-Create-PipelineData-object)\n", + " * 4.2.3 [Create PythonScriptStep](#4.2.3-Create-PythonScriptStep)\n", + " * 4.3 [AutoMLStep](#4.3-AutoMLStep)\n", + " * 4.3.1 [Define get_data script to load data](#4.3.1-Define-get_data-script-to-load-data)\n", + " * 4.3.2 [Create AutoMLConfig object](#4.3.2-Create-AutoMLConfig-object)\n", + " * 4.3.3 [Create AutoMLStep](#4.3.3-Create-AutoMLStep) \n", + "5. [Run Pipeline](#5.-Run-Pipeline) \n", + "6. [Deploy Sentence Similarity Model](#6.-Deploy-Sentence-Similarity-Model)\n", + " * 6.1 [Register/Retrieve AutoML and Google Universal Sentence Encoder Models for Deployment](#6.1-Register/Retrieve-AutoML-and-Google-Universal-Sentence-Encoder-Models-for-Deployment) \n", + " * 6.2 [Create Scoring Script](#6.2-Create-Scoring-Script)\n", + " * 6.3 [Create a YAML File for the Environment](#6.3-Create-a-YAML-File-for-the-Environment) \n", + " * 6.4 [Image Creation](#6.4-Image-Creation) \n", + " * 6.5 [Provision the AKS Cluster](#6.5-Provision-the-AKS-Cluster) \n", + " * 6.6 [Deploy the image as a Web Service to Azure Kubernetes Service](#6.6-Deploy-the-image-as-a-Web-Service-to-Azure-Kubernetes-Service) \n", + " * 6.7 [Test Deployed Model](#6.7-Test-Deployed-Webservice) \n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 What are AzureML Pipelines?\n", + "\n", + "[AzureML Pipelines](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) define reusable machine learning workflows that can be used as a template for your machine learning scenarios. 
Pipelines allow you to optimize your workflow and spend time on machine learning rather than infrastructure. A Pipeline is defined by a series of steps; the following steps are available: AdlaStep, AutoMLStep, AzureBatchStep, DataTransferStep, DatabricksStep, EstimatorStep, HyperDriveStep, ModuleStep, MpiStep, and PythonScriptStep (see [here](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/?view=azure-ml-py) for details of each step). When the pipeline is run, cached results are used for all steps that have not changed, optimizing the run time. Data sources and intermediate data can be used across multiple steps in a pipeline, saving time and resources. Below we see an example of an AzureML pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://nlpbp.blob.core.windows.net/images/pipelines.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 What is Azure AutoML?\n", + "\n", + "Automated machine learning ([AutoML](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-automated-ml)) is a capability of Microsoft's [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/\n", + "). The goal of AutoML is to improve the productivity of data scientists and democratize AI by allowing for the rapid development and deployment of machine learning models. To achieve this goal, AutoML automates the process of selecting an ML model and tuning the model. All the user is required to provide is a dataset (suitable for a classification, regression, or time-series forecasting problem) and a metric to optimize in choosing the model and hyperparameters. The user is also given the ability to set time and cost constraints for the model selection and tuning." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](automl.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The AutoML model selection and tuning process can be easily tracked through the Azure portal or directly in python notebooks through the use of widgets. AutoML quickly selects a high quality machine learning model tailored for your prediction problem. In this notebook, we walk through the steps of preparing data, setting up an AutoML experiment, and evaluating the results of our best model. More information about running AutoML experiments in Python can be found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Modeling Problem\n", + "\n", + "The regression problem we will demonstrate is predicting sentence similarity scores on the STS Benchmark dataset. The [STS Benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#STS_benchmark_dataset_and_companion_dataset) contains a selection of English datasets that were used in Semantic Textual Similarity (STS) tasks 2012-2017. The dataset contains 8,628 sentence pairs with a human-labeled integer representing the sentences' similarity (ranging from 0, for no meaning overlap, to 5, meaning equivalence).\n", + "\n", + "For each sentence in the sentence pair, we will use Google's pretrained Universal Sentence Encoder (details provided below) to generate a $512$-dimensional embedding. Both embeddings in the sentence pair will be concatenated and the resulting $1024$-dimensional vector will be used as features in our regression problem. Our target variable is the sentence similarity score." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Set the environment path to find NLP\n", + "import sys\n", + "\n", + "sys.path.append(\"../../\")\n", + "import time\n", + "import logging\n", + "import csv\n", + "import os\n", + "import pandas as pd\n", + "import shutil\n", + "import numpy as np\n", + "import torch\n", + "import sys\n", + "from scipy.stats import pearsonr\n", + "from scipy.spatial import distance\n", + "from sklearn.externals import joblib\n", + "import json\n", + "\n", + "# Import utils\n", + "from utils_nlp.azureml import azureml_utils\n", + "from utils_nlp.dataset import stsbenchmark\n", + "from utils_nlp.dataset.preprocess import (\n", + " to_lowercase,\n", + " to_spacy_tokens,\n", + " rm_spacy_stopwords,\n", + ")\n", + "from utils_nlp.common.timer import Timer\n", + "\n", + "# Google Universal Sentence Encoder loader\n", + "import tensorflow_hub as hub\n", + "\n", + "# AzureML packages\n", + "import azureml as aml\n", + "import logging\n", + "from azureml.telemetry import set_diagnostics_collection\n", + "\n", + "set_diagnostics_collection(send_diagnostics=True)\n", + "from azureml.core import Datastore, Experiment\n", + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "from azureml.core.webservice import AksWebservice, Webservice\n", + "from azureml.core.compute import AksCompute, ComputeTarget\n", + "from azureml.core.image import ContainerImage\n", + "from azureml.core.model import Model\n", + "from azureml.train.automl import AutoMLStep, AutoMLStepRun, AutoMLConfig\n", + "from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput\n", + "from azureml.pipeline.steps import PythonScriptStep\n", + "from azureml.data.data_reference import DataReference\n", + "from azureml.widgets import 
RunDetails\n", + "import tensorflow as tf\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"Azure ML SDK Version:\", aml.core.VERSION)\n", + "print(\"Pandas version: {}\".format(pd.__version__))\n", + "print(\"Tensorflow Version:\", tf.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DATA_PATH = \"../../data\"\n", + "EMBEDDED_DATA_REF = os.environ[\"AZUREML_DATAREFERENCE_embedded_data\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_settings = {\n", + " \"task\": \"regression\", # type of task: classification, regression or forecasting\n", + " \"iteration_timeout_minutes\": 15, # How long each iteration can take before moving on\n", + " \"iterations\": 50, # Number of algorithm options to try\n", + " \"primary_metric\": \"spearman_correlation\", # Metric to optimize\n", + " \"preprocess\": True, # Whether dataset preprocessing should be applied\n", + " \"verbosity\": logging.INFO,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Data Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**STS Benchmark Dataset**\n", + "\n", + "As described above, the STS Benchmark dataset contains 8.6K sentence pairs along with a human-annotated score for how similar the two sentences are. We will load the training, development (validation), and test sets provided by STS Benchmark and preprocess the data (lowercase the text, drop irrelevant columns, and rename the remaining columns) using the utils contained in this repo. Each dataset will ultimately have three columns: _sentence1_ and _sentence2_ which contain the text of the sentences in the sentence pair, and _score_ which contains the human-annotated similarity score of the sentence pair." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load in the raw datasets as pandas dataframes\n", + "train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", + "dev_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n", + "test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean each dataset by lowercasing text, removing irrelevant columns,\n", + "# and renaming the remaining columns\n", + "train_clean = stsbenchmark.clean_sts(train_raw)\n", + "dev_clean = stsbenchmark.clean_sts(dev_raw)\n", + "test_clean = stsbenchmark.clean_sts(test_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert all text to lowercase\n", + "train = to_lowercase(train_clean)\n", + "dev = to_lowercase(dev_clean)\n", + "test = to_lowercase(test_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Training set has {} sentences\".format(len(train)))\n", + "print(\"Development set has {} sentences\".format(len(dev)))\n", + "print(\"Testing set has {} sentences\".format(len(test)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the cleaned data\n", + "if not os.path.isdir(\"data\"):\n", + " os.mkdir(\"data\")\n", + "\n", + "train.to_csv(\"data/train.csv\", index=False)\n", + "test.to_csv(\"data/test.csv\", index=False)\n", + "dev.to_csv(\"data/dev.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
AzureML Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we set up the necessary components for running this as an AzureML experiment\n", + "1. Create or link to an existing `Workspace`\n", + "2. Set up an `Experiment` with `logging`\n", + "3. Create or attach existing `AmlCompute`\n", + "4. Upload our data to a `Datastore`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.1 Link to or create a Workspace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.\n", + "\n", + "**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ws = azureml_utils.get_or_create_workspace(\n", + " subscription_id=\"\",\n", + " resource_group=\"\",\n", + " workspace_name=\"\",\n", + " workspace_region=\"\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\n", + " \"Workspace name: \" + ws.name,\n", + " \"Azure region: \" + ws.location,\n", + " \"Subscription id: \" + ws.subscription_id,\n", + " \"Resource group: \" + ws.resource_group,\n", + " sep=\"\\n\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.2 Set up an Experiment and Logging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Make a folder for the project\n", + "project_folder = \"./automl-sentence-similarity\"\n", + "os.makedirs(project_folder, 
exist_ok=True)\n", + "\n", + "# Set up an experiment\n", + "experiment_name = \"automl-sentence-similarity\"\n", + "experiment = Experiment(ws, experiment_name)\n", + "\n", + "# Add logging to our experiment\n", + "run = experiment.start_logging()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.3 Link AmlCompute Compute Target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use AzureML Pipelines we need to link a compute target as they can not be run locally (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for explanation of the different options). We will use an AmlCompute target in this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# choose your cluster\n", + "cluster_name = \"gpucluster\"\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print(\"Found existing compute target.\")\n", + "except ComputeTargetException:\n", + " print(\"Creating a new compute target...\")\n", + " compute_config = AmlCompute.provisioning_configuration(\n", + " vm_size=\"STANDARD_NC6\", max_nodes=4\n", + " )\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + "# use get_status() to get a detailed status for the current AmlCompute.\n", + "print(compute_target.get_status().serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.4 Upload data to Datastore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This step uploads our local data to a `Datastore` so that the data is accessible from the remote compute target and creates a `DataReference` to point to the location of the data on the Datastore. 
A DataStore is backed either by Azure File Storage (default option) or Azure Blob Storage ([how to decide between these options](https://docs.microsoft.com/en-us/azure/storage/common/storage-decide-blobs-files-disks)) and data is made accessible by mounting or copying data to the compute target. `ws.datastores` lists all options for datastores and `ds.account_name` gets the name of the datastore that can be used to find it in the Azure portal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select a specific datastore or you can call ws.get_default_datastore()\n", + "datastore_name = \"workspacefilestore\"\n", + "ds = ws.datastores[datastore_name]\n", + "\n", + "# Upload files in data folder to the datastore\n", + "ds.upload(\n", + " src_dir=\"./data\",\n", + " target_path=\"stsbenchmark_data\",\n", + " overwrite=True,\n", + " show_progress=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also set up a `DataReference` object that points to the data we just uploaded into the stsbenchmark_data folder. DataReference objects point to data that is accessible from a datastore and will be used as an input into our pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_data = DataReference(\n", + " datastore=ds,\n", + " data_reference_name=\"stsbenchmark\",\n", + " path_on_datastore=\"stsbenchmark_data/\",\n", + " overwrite=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Create AzureML Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we set up our pipeline which is made of two steps: \n", + "1. `PythonScriptStep`: takes each sentence pair from the data in the `Datastore` and concatenates the Google USE embeddings for each sentence into one vector. 
This step saves the embedding feature matrix back to our `Datastore` and uses a `PipelineData` object to represent this intermediate data. \n", + "2. `AutoMLStep`: takes the intermediate data produced by the previous step and passes it to an `AutoMLConfig` which performs the automatic model selection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.1 Set up run configuration file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we set up a `RunConfiguration` object which configures the execution environment for an experiment (sets up the conda dependencies, etc.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new RunConfig object\n", + "conda_run_config = RunConfiguration(framework=\"python\")\n", + "\n", + "# Set compute target to AmlCompute\n", + "conda_run_config.target = compute_target\n", + "\n", + "conda_run_config.environment.docker.enabled = True\n", + "conda_run_config.environment.docker.base_image = aml.core.runconfig.DEFAULT_CPU_IMAGE\n", + "\n", + "# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", + "conda_run_config.environment.python.user_managed_dependencies = False\n", + "\n", + "conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(\n", + " pip_packages=[\n", + " \"azureml-sdk[automl]\",\n", + " \"azureml-sdk\",\n", + " \"azureml-dataprep\",\n", + " \"azureml-train-automl==1.0.33\",\n", + " ],\n", + " conda_packages=[\n", + " \"numpy\",\n", + " \"py-xgboost<=0.80\",\n", + " \"pandas\",\n", + " \"tensorflow\",\n", + " \"tensorflow-hub\",\n", + " \"scikit-learn\",\n", + " ],\n", + " pin_sdk_version=False,\n", + ")\n", + "\n", + "print(\"run config is ready\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.2 PythonScriptStep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"`PythonScriptStep` is a step which runs a user-defined Python script ([documentation](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.python_script_step.pythonscriptstep?view=azure-ml-py) here). In this `PythonScriptStep`, we will convert our sentences into a numerical representation in order to use them in our machine learning model. We will embed both sentences using the Google Universal Sentence Encoder (provided by tensorflow-hub) and concatenate their representations into a $1024$-dimensional vector to use as features for AutoML.\n", + "\n", + "**Google Universal Sentence Encoder:**\n", + "We'll use a popular sentence encoder called Google Universal Sentence Encoder (see [original paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf)). Google provides two pretrained models based on different design goals: a Transformer model (targets high accuracy even if this reduces model complexity) and a Deep Averaging Network model (DAN; targets efficient inference). Both models are trained on a variety of web sources (Wikipedia, news, question-answers pages, and discussion forums) and produced 512-dimensional embeddings. This notebook utilizes the Transformer-based encoding model which can be downloaded [here](https://tfhub.dev/google/universal-sentence-encoder-large/3) because of its better performance relative to the DAN model on the STS Benchmark dataset (see Table 2 in Google Research's [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf)). The Transformer model produces sentence embeddings using the \"encoding sub-graph of the transformer architecture\" (original architecture introduced [here](https://arxiv.org/abs/1706.03762)). \"This sub-graph uses attention to compute context aware representations of words in a sentence that take into account both the ordering and identity of all the other workds. 
The context aware word representations are converted to a fixed length sentence encoding vector by computing the element-wise sum of the representations at each word position.\" The input to the model is lowercase PTB-tokenized strings and the model is designed to be useful for multiple different tasks by using multi-task learning. More details about the model can be found in the [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46808.pdf) by Google Research." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2.1 Define python script to run\n", + "\n", + "Define the script (called embed.py) that the `PythonScriptStep` will execute:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $project_folder/embed.py\n", + "import argparse\n", + "import os\n", + "import azureml.core\n", + "import pandas as pd\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "\n", + "tf.logging.set_verbosity(tf.logging.ERROR) # reduce logging output\n", + "\n", + "\n", + "def google_encoder(dataset):\n", + " \"\"\" Function that embeds sentences using the Google Universal\n", + " Sentence Encoder pretrained model\n", + " \n", + " Parameters:\n", + " ----------\n", + " dataset: pandas dataframe with sentences and scores\n", + " \n", + " Returns:\n", + " -------\n", + " emb1: 512-dimensional representation of sentence1\n", + " emb2: 512-dimensional representation of sentence2\n", + " \"\"\"\n", + " sts_input1 = tf.placeholder(tf.string, shape=(None))\n", + " sts_input2 = tf.placeholder(tf.string, shape=(None))\n", + "\n", + " # Apply embedding model and normalize the input\n", + " sts_encode1 = tf.nn.l2_normalize(embedding_model(sts_input1), axis=1)\n", + " sts_encode2 = tf.nn.l2_normalize(embedding_model(sts_input2), axis=1)\n", + "\n", + " with tf.Session() as session:\n", + " 
session.run(tf.global_variables_initializer())\n", + " session.run(tf.tables_initializer())\n", + " emb1, emb2 = session.run(\n", + " [sts_encode1, sts_encode2],\n", + " feed_dict={\n", + " sts_input1: dataset[\"sentence1\"],\n", + " sts_input2: dataset[\"sentence2\"],\n", + " },\n", + " )\n", + " return emb1, emb2\n", + "\n", + "\n", + "def feature_engineering(dataset):\n", + " \"\"\"Extracts embedding features from the dataset and returns\n", + " features and target in a dataframe\n", + " \n", + " Parameters:\n", + " ----------\n", + " dataset: pandas dataframe with sentences and scores\n", + " \n", + " Returns:\n", + " -------\n", + " df: pandas dataframe with embedding features\n", + " scores: list of target variables\n", + " \"\"\"\n", + " google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\n", + " n_google = google_USE_emb1.shape[1] # length of the embeddings\n", + " df = np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\n", + " names = [\"USEEmb1_\" + str(i) for i in range(n_google)] + [\n", + " \"USEEmb2_\" + str(i) for i in range(n_google)\n", + " ]\n", + " df = pd.DataFrame(df, columns=names)\n", + " return df, dataset[\"score\"]\n", + "\n", + "\n", + "def write_output(df, path, name):\n", + " \"\"\"Write dataframes to correct path\"\"\"\n", + " os.makedirs(path, exist_ok=True)\n", + " print(\"%s created\" % path)\n", + " df.to_csv(path + \"/\" + name, index=False)\n", + "\n", + "\n", + "# Parse arguments\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument(\"--sentence_data\", type=str)\n", + "parser.add_argument(\"--embedded_data\", type=str)\n", + "args = parser.parse_args()\n", + "\n", + "# Import the Universal Sentence Encoder's TF Hub module\n", + "module_url = \"https://tfhub.dev/google/universal-sentence-encoder-large/3\"\n", + "embedding_model = hub.Module(module_url)\n", + "\n", + "# Read data\n", + "train = pd.read_csv(args.sentence_data + \"/train.csv\")\n", + "dev = pd.read_csv(args.sentence_data + 
\"/dev.csv\")\n", + "\n", + "# Get Google USE features\n", + "training_data, training_scores = feature_engineering(train)\n", + "validation_data, validation_scores = feature_engineering(dev)\n", + "\n", + "# Write out training data to Datastore\n", + "write_output(training_data, args.embedded_data, \"X_train.csv\")\n", + "write_output(\n", + " pd.DataFrame(training_scores, columns=[\"score\"]), args.embedded_data, \"y_train.csv\"\n", + ")\n", + "\n", + "# Write out validation data to Datastore\n", + "write_output(validation_data, args.embedded_data, \"X_dev.csv\")\n", + "write_output(\n", + " pd.DataFrame(validation_scores, columns=[\"score\"]), args.embedded_data, \"y_dev.csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2.2 Create PipelineData object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PipelineData` objects represent a piece of intermediate data in a pipeline. Generally they are produced by one step (as an output) and then consumed by the next step (as an input), introducing an implicit order between steps in a pipeline. We create a PipelineData object that can represent the data produced by our first pipeline step that will be consumed by our second pipeline step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedded_data = PipelineData(\"embedded_data\", datastore=ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2.3 Create PythonScriptStep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This step defines the `PythonScriptStep`. We give the step a name, tell the step which python script to run (embed.py) and what directory that script is located in (source_directory). 
Note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the embed.py file.\n", + "\n", + "We also link the compute target and run configuration that we made previously. Our input is the `DataReference` object (input_data) where our raw sentence data was uploaded and our output is the `PipelineData` object (embedded_data) where the embedded data produced by this step will be stored. These are also passed in as arguments so that we have access to the correct data paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embed_step = PythonScriptStep(\n", + " name=\"Embed\",\n", + " script_name=\"embed.py\",\n", + " arguments=[\"--embedded_data\", embedded_data, \"--sentence_data\", input_data],\n", + " inputs=[input_data],\n", + " outputs=[embedded_data],\n", + " compute_target=compute_target,\n", + " runconfig=conda_run_config,\n", + " hash_paths=[\"embed.py\"],\n", + " source_directory=project_folder,\n", + " allow_reuse=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.3 AutoMLStep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`AutoMLStep` creates an AutoML step in a pipeline (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py) and [basic example](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb)). When using AutoML on remote compute, rather than passing our data directly into the `AutoMLConfig` object as we did in the local example, we must define a get_data.py script with a get_data() function to pass as the data_script argument. 
This workflow can be used for both local and remote executions (see [details](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-remote)). \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3.1 Define get_data script to load data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define the get_data.py file and get_data() function that the `AutoMLStep` will execute to collect data. Note that we can directly access the path of the intermediate data (called embedded_data) through `os.environ['AZUREML_DATAREFERENCE_embedded_data']`. This is necessary because the AutoMLStep does not accept additional parameters like the PythonScriptStep does with `arguments`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $project_folder/get_data.py\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "\n", + "def get_data():\n", + " \"\"\"Function needed to load data for use on remote AutoML experiments\"\"\"\n", + " X_train = pd.read_csv(EMBEDDED_DATA_REF + \"/X_train.csv\")\n", + " y_train = pd.read_csv(EMBEDDED_DATA_REF + \"/y_train.csv\")\n", + " X_dev = pd.read_csv(EMBEDDED_DATA_REF + \"/X_dev.csv\")\n", + " y_dev = pd.read_csv(EMBEDDED_DATA_REF + \"/y_dev.csv\")\n", + " return {\n", + " \"X\": X_train.values,\n", + " \"y\": y_train.values.flatten(),\n", + " \"X_valid\": X_dev.values,\n", + " \"y_valid\": y_dev.values.flatten(),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3.2 Create AutoMLConfig object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we specify the parameters for the `AutoMLConfig` class:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**task** \n", + "AutoML supports the following base learners for the regression task: Elastic Net, Light GBM, Gradient Boosting, 
Decision Tree, K-nearest Neighbors, LARS Lasso, Stochastic Gradient Descent, Random Forest, Extremely Randomized Trees, XGBoost, DNN Regressor, Linear Regression. In addition, AutoML also supports two kinds of ensemble methods: voting (weighted average of the output of multiple base learners) and stacking (training a second \"metalearner\" which uses the base algorithms' predictions to predict the target variable). Specific base learners can be included or excluded in the parameters for the AutoMLConfig class (whitelist_models and blacklist_models) and the voting/stacking ensemble options can be specified as well (enable_voting_ensemble and enable_stack_ensemble)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**preprocess** \n", + "AutoML also has advanced preprocessing methods, eliminating the need for users to perform this manually. Data is automatically scaled and normalized but an additional parameter in the AutoMLConfig class enables the use of more advanced techniques including imputation, generating additional features, transformations, word embeddings, etc. (full list found [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-create-portal-experiments#preprocess)). Note that algorithm-specific preprocessing will be applied even if preprocess=False. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**primary_metric** \n", + "The regression metrics available are the following: Spearman Correlation (spearman_correlation), Normalized RMSE (normalized_root_mean_squared_error), Normalized MAE (normalized_mean_absolute_error), and R2 score (r2_score) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Constraints:** \n", + "There is a cost_mode parameter to set cost prediction modes (see options [here](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlconfig?view=azure-ml-py)). 
To set constraints on time there are multiple parameters including experiment_exit_score (target score to exit the experiment after achieving), experiment_timeout_minutes (maximum amount of time for all combined iterations), and iterations (total number of different algorithm and parameter combinations to try)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_config = AutoMLConfig(\n", + " debug_log=\"automl_errors.log\",\n", + " path=project_folder,\n", + " compute_target=compute_target,\n", + " run_configuration=conda_run_config,\n", + " data_script=project_folder\n", + " + \"/get_data.py\", # local path to script with get_data() function\n", + " **automl_settings #where the autoML main settings are defined\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3.3 Create AutoMLStep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we create `PipelineData` objects for the model data (our outputs) and then create the `AutoMLStep`. The `AutoMLStep` requires a `AutoMLConfig` object and we pass our intermediate data (embedded_data) in as the inputs. Again, note that the hash_paths parameter will be deprecated but currently is needed to check for any updates to the get_data.py file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create PipelineData objects for tracking AutoML metrics\n", + "\n", + "metrics_data = PipelineData(\n", + " name=\"metrics_data\",\n", + " datastore=ds,\n", + " pipeline_output_name=\"metrics_output\",\n", + " training_output=TrainingOutput(type=\"Metrics\"),\n", + ")\n", + "model_data = PipelineData(\n", + " name=\"model_data\",\n", + " datastore=ds,\n", + " pipeline_output_name=\"best_model_output\",\n", + " training_output=TrainingOutput(type=\"Model\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_step = AutoMLStep(\n", + " name=\"AutoML\",\n", + " automl_config=automl_config, # the AutoMLConfig object created previously\n", + " inputs=[\n", + " embedded_data\n", + " ], # inputs is the PipelineData that was the output of the previous pipeline step\n", + " outputs=[\n", + " metrics_data,\n", + " model_data,\n", + " ], # PipelineData objects to reference metric and model information\n", + " hash_paths=[\"get_data.py\"],\n", + " allow_reuse=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Run Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we set up our pipeline which requires specifying our `Workspace` and the ordering of the steps that we created (steps parameter). We submit the pipeline and inspect the run details using a RunDetails widget. For remote runs, the execution of iterations is asynchronous." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = Pipeline(\n", + " description=\"pipeline_embed_automl\", # give a name for the pipeline\n", + " workspace=ws,\n", + " steps=[embed_step, automl_step],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_run = experiment.submit(pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the run details using the provided widget\n", + "RunDetails(pipeline_run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://nlpbp.blob.core.windows.net/images/pipelineWidget.PNG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, block until the run has completed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_run.wait_for_completion(\n", + " show_output=True\n", + ") # show console output while run is in progress" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Cancel the Run**\n", + "\n", + "Interrupting/Restarting the jupyter kernel will not properly cancel the run, which can lead to wasted compute resources. To avoid this, we recommend explicitly canceling a run with the following code:\n", + "\n", + "`pipeline_run.cancel()`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. Deploy Sentence Similarity Model\n", + "\n", + "Deploying an Azure Machine Learning model as a web service creates a REST API. You can send data to this API and receive the prediction returned by the model.\n", + "In general, you create a webservice by deploying a model as an image to a Compute Target.\n", + "\n", + "Some of the Compute Targets are: \n", + "1. Azure Container Instance\n", + "2. 
Azure Kubernetes Service\n", + "3. Local web service\n", + "\n", + "The general workflow for deploying a model is as follows:\n", + "1. Register a model\n", + "2. Prepare to deploy\n", + "3. Deploy the model to the compute target\n", + "4. Test the deployed model (webservice)\n", + "\n", + "In this notebook we walk you through the process of creating a webservice running on Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes\n", + ")) by deploying the model as an image. AKS is good for high-scale production deployments. It provides fast response time and autoscaling of the deployed service. Cluster autoscaling is not supported through the Azure Machine Learning SDK. \n", + "\n", + "You can find more information on deploying and serving models [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.1 Register/Retrieve AutoML and Google Universal Sentence Encoder Models for Deployment\n", + "\n", + "Registering a model means registering one or more files that make up a model. The Machine Learning models are registered in your current Azure Machine Learning Workspace. 
The model can either come from Azure Machine Learning or another location, such as your local machine.\n", + "\n", + "See other ways to register a model [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-and-where)\n", + "\n", + "Below we show how to register a new model and also how to retrieve and register an existing model.\n", + "\n", + "### Register a new automl model\n", + "Register the best AutoML model based on the pipeline results or load the saved model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_step_run = AutoMLStepRun(step_run=pipeline_run.find_step_run(\"AutoML\")[0])\n", + "# to get the outputs\n", + "best_run, fitted_model = automl_step_run.get_output()\n", + "\n", + "# to register the fitted_model\n", + "description = \"Pipeline Model\"\n", + "tags = {\"area\": \"nlp\", \"type\": \"sentencesimilarity pipelines\"}\n", + "model = automl_step_run.register_model(description=description, tags=tags)\n", + "automl_model_name = automl_step_run.model_id\n", + "print(\n", + " automl_step_run.model_id\n", + ") # Use this id to deploy the model as a web service in Azure." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve existing model from Azure\n", + "If you already have a best model then you can skip registering the model by just retrieving the latest version of model by providing its name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_model_name = \"76a6169d7f364bdbest\" # best fit model registered in the workspace\n", + "model = Model(ws, name=automl_model_name)\n", + "print(\"Found model with name\", automl_model_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register Google Universal Sentence Encoder Model\n", + "Register the Google Universal Sentence Encoder model if not already registered in your workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set location for where to download google tensorflow model\n", + "os.environ[\"TFHUB_CACHE_DIR\"] = \"./googleUSE\"\n", + "# download model\n", + "hub.Module(\"https://tfhub.dev/google/universal-sentence-encoder-large/3\")\n", + "# register model\n", + "embedding_model = Model.register(\n", + " model_path=\"googleUSE\",\n", + " model_name=\"googleUSEmodel\",\n", + " tags={\"Model\": \"GoogleUSE\"},\n", + " description=\"Google Universal Sentence Embedding pretrained model\",\n", + " workspace=ws,\n", + ")\n", + "print(\"Registered googleUSEembeddings model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve existing Google USE model from Azure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_model = Model(ws, name=\"googleUSEmodel\")\n", + "print(\"Found model with name googleUSEembeddings\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.2 Create Scoring Script\n", + "\n", + "In this section we show an 
example of an entry script, which is called from the deployed webservice. `score.py` is our entry script. The script must contain:\n", + "1. init() - This function loads the model in a global object.\n", + "2. run() - This function is used for model prediction. The inputs and outputs to `run()` typically use JSON for serialization and deserilization. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile score.py\n", + "import pickle\n", + "import json\n", + "import numpy as np\n", + "import azureml.train.automl\n", + "from sklearn.externals import joblib\n", + "from azureml.core.model import Model\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "import os\n", + "\n", + "tf.logging.set_verbosity(tf.logging.ERROR) # reduce logging output\n", + "\n", + "\n", + "def google_encoder(dataset):\n", + " \"\"\" Function that embeds sentences using the Google Universal\n", + " Sentence Encoder pretrained model\n", + " \n", + " Parameters:\n", + " ----------\n", + " dataset: pandas dataframe with sentences and scores\n", + " \n", + " Returns:\n", + " -------\n", + " emb1: 512-dimensional representation of sentence1\n", + " emb2: 512-dimensional representation of sentence2\n", + " \"\"\"\n", + " global embedding_model, sess\n", + " sts_input1 = tf.placeholder(tf.string, shape=(None))\n", + " sts_input2 = tf.placeholder(tf.string, shape=(None))\n", + "\n", + " # Apply embedding model and normalize the input\n", + " sts_encode1 = tf.nn.l2_normalize(embedding_model(sts_input1), axis=1)\n", + " sts_encode2 = tf.nn.l2_normalize(embedding_model(sts_input2), axis=1)\n", + "\n", + " sess.run(tf.global_variables_initializer())\n", + " sess.run(tf.tables_initializer())\n", + " emb1, emb2 = sess.run(\n", + " [sts_encode1, sts_encode2],\n", + " feed_dict={sts_input1: dataset[\"sentence1\"], sts_input2: dataset[\"sentence2\"]},\n", + " )\n", + " return emb1, 
emb2\n", + "\n", + "\n", + "def feature_engineering(dataset):\n", + " \"\"\"Extracts embedding features from the dataset and returns\n", + " features and target in a dataframe\n", + " \n", + " Parameters:\n", + " ----------\n", + " dataset: pandas dataframe with sentences and scores\n", + " \n", + " Returns:\n", + " -------\n", + " df: pandas dataframe with embedding features\n", + " scores: list of target variables\n", + " \"\"\"\n", + " google_USE_emb1, google_USE_emb2 = google_encoder(dataset)\n", + " n_google = google_USE_emb1.shape[1] # length of the embeddings\n", + " return np.concatenate((google_USE_emb1, google_USE_emb2), axis=1)\n", + "\n", + "\n", + "def init():\n", + " global model, googleUSE_dir_path\n", + " model_path = Model.get_model_path(\n", + " model_name=\"<>\"\n", + " ) # this name is model.id of model that we want to deploy\n", + " # deserialize the model file back into a sklearn model\n", + " model = joblib.load(model_path)\n", + "\n", + " # load the path for google USE embedding model\n", + " googleUSE_dir_path = Model.get_model_path(model_name=\"googleUSEmodel\")\n", + " os.environ[\"TFHUB_CACHE_DIR\"] = googleUSE_dir_path\n", + "\n", + "\n", + "def run(rawdata):\n", + " global embedding_model, sess, googleUSE_dir_path, model\n", + " try:\n", + " # load data and convert to dataframe\n", + " data = json.loads(rawdata)[\"data\"]\n", + " data_df = pd.DataFrame(data, columns=[\"sentence1\", \"sentence2\"])\n", + "\n", + " # begin a tensorflow session and load tensorhub module\n", + " sess = tf.Session()\n", + " embedding_model = hub.Module(\n", + " googleUSE_dir_path + \"/96e8f1d3d4d90ce86b2db128249eb8143a91db73\"\n", + " )\n", + "\n", + " # Embed sentences using Google USE model\n", + " embedded_data = feature_engineering(data_df)\n", + " # Predict using AutoML saved model\n", + " result = model.predict(embedded_data)\n", + "\n", + " except Exception as e:\n", + " result = str(e)\n", + " sess.close()\n", + " return json.dumps({\"error\": 
result})\n", + "\n", + " sess.close()\n", + " return json.dumps({\"result\": result.tolist()})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Substitute the actual model id in the script file.\n", + "script_file_name = \"score.py\"\n", + "\n", + "with open(script_file_name, \"r\") as cefr:\n", + " content = cefr.read()\n", + "\n", + "with open(script_file_name, \"w\") as cefw:\n", + " cefw.write(content.replace(\"<>\", automl_model_name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.3 Create a YAML File for the Environment\n", + "\n", + "To ensure the fit results are consistent with the training results, the SDK dependency versions need to be the same as the environment that trains the model. The following cells create a file, pipeline_env.yml, which specifies the dependencies from the run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "myenv = CondaDependencies.create(\n", + " conda_packages=[\n", + " \"numpy\",\n", + " \"scikit-learn\",\n", + " \"py-xgboost<=0.80\",\n", + " \"pandas\",\n", + " \"tensorflow\",\n", + " \"tensorflow-hub\",\n", + " ],\n", + " pip_packages=[\"azureml-sdk[automl]==1.0.43.*\"],\n", + " python_version=\"3.6.8\",\n", + ")\n", + "\n", + "conda_env_file_name = \"pipeline_env.yml\"\n", + "myenv.save_to_file(\".\", conda_env_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.4 Image Creation\n", + "\n", + "In this step we create a container image which is wrapper containing the entry script, yaml file with package dependencies and the model. The created image is then deployed as a webservice in the next step. This step can take up to 10 minutes and even longer if the model is large." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# trying to add dependencies\n", + "image_config = ContainerImage.image_configuration(\n", + " execution_script=script_file_name,\n", + " runtime=\"python\",\n", + " conda_file=conda_env_file_name,\n", + " description=\"Image with aml pipeline model\",\n", + " tags={\"area\": \"nlp\", \"type\": \"sentencesimilarity pipeline\"},\n", + ")\n", + "\n", + "image = ContainerImage.create(\n", + " name=\"pipeline-image\",\n", + " # this is the model object\n", + " models=[model, embedding_model], # add both embedding and autoML models\n", + " image_config=image_config,\n", + " workspace=ws,\n", + ")\n", + "\n", + "image.wait_for_creation(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the above step fails, then use below command to see logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# image.get_logs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.5 Provision the AKS Cluster\n", + "\n", + "**Time estimate:** Approximately 20 minutes.\n", + "\n", + "Creating or attaching an AKS cluster is a one time process for your workspace. You can reuse this cluster for multiple deployments. If you delete the cluster or the resource group that contains it, you must create a new cluster the next time you need to deploy. You can have multiple AKS clusters attached to your workspace.\n", + "\n", + "If you delete the cluster or the resource group that contains it, then you would have to recreate it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create aks cluser\n", + "\n", + "# Use the default configuration (can also provide parameters to customize)\n", + "prov_config = AksCompute.provisioning_configuration()\n", + "\n", + "# Create the cluster\n", + "aks_target = ComputeTarget.create(\n", + " workspace=ws, name=\"nlp-aks\", provisioning_configuration=prov_config\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## 6.6 Deploy the Image as a Web Service on Azure Kubernetes Service\n", + "\n", + "In the case of deployment on AKS, in addition to the Docker image, we need to define computational resources. This is typically a cluster of CPUs or a cluster of GPUs. If we already have a Kubernetes-managed cluster in our workspace, we can use it, otherwise, we can create a new one.\n", + "\n", + "In this notebook we will use the cluster in the above cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set the web service configuration\n", + "aks_config = AksWebservice.deploy_configuration()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are now ready to deploy our web service. We will deploy from the Docker image. It contains our AutoML model as well as the Google Universal Sentence Encoder model and the conda environment needed for the scoring script to work properly. The parameters to pass to the Webservice.deploy_from_image() command are similar to those used for the deployment on ACI. The only major difference is the compute target (aks_target), i.e. the CPU cluster we just spun up.\n", + "\n", + "**Note:** This deployment takes a few minutes to complete." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# deploy image as web service\n", + "aks_service_name = \"aks-with-pipelines-service-1\"\n", + "\n", + "aks_service = Webservice.deploy_from_image(\n", + " workspace=ws,\n", + " name=aks_service_name,\n", + " image=image,\n", + " deployment_config=aks_config,\n", + " deployment_target=aks_target,\n", + ")\n", + "aks_service.wait_for_deployment(show_output=True)\n", + "print(aks_service.state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the above step fails then use below command to see logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# aks_service.get_logs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.7 Test Deployed Webservice\n", + "\n", + "Testing the deployed model means running the created webservice.
\n", + "The deployed model can be tested by passing a list of sentence pairs. The output will be a score between 0 and 5, with 0 indicating no meaning overlap between the sentences and 5 meaning equivalence.\n", + "\n", + "The run method expects input in json format. The Run() method retrieves API keys behind the scenes to make sure that the call is authenticated. The service has a timeout (default of ~30 seconds) which does not allow passing the large test dataset. To overcome this, you can batch data and send it to the service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " [\"This is sentence1\", \"This is sentence1\"],\n", + " [\"A hungry cat.\", \"A sleeping cat\"],\n", + " [\"Its summer time \", \"Winter is coming\"],\n", + "]\n", + "data = {\"data\": sentences}\n", + "data = json.dumps(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set up a Timer to see how long the model takes to predict\n", + "t = Timer()\n", + "\n", + "t.start()\n", + "score = aci_service.run(input_data=data)\n", + "t.stop()\n", + "\n", + "print(\"Time elapsed: {}\".format(t))\n", + "\n", + "result = json.loads(score)\n", + "try:\n", + " output = result[\"result\"]\n", + " print(\"Number of samples predicted: {}\".format(len(output)))\n", + " print(output)\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we'll calculate the Pearson Correlation on the test set.\n", + "\n", + "**What is Pearson Correlation?**\n", + "\n", + "Our evaluation metric is Pearson correlation ($\\rho$) which is a measure of the linear correlation between two variables. 
The formula for calculating Pearson correlation is as follows: \n", + "\n", + "$$\\rho_{X,Y} = \\frac{E[(X-\\mu_X)(Y-\\mu_Y)]}{\\sigma_X \\sigma_Y}$$\n", + "\n", + "This metric takes a value in [-1,1] where -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 represents no correlation. We utilize the Pearson correlation metric as this is the main metric that [SentEval](http://nlpprogress.com/english/semantic_textual_similarity.html), a widely-used evaluation toolkit for evaluation sentence representations, uses for the STS Benchmark dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load test set sentences\n", + "data = pd.read_csv(\"data/test.csv\")\n", + "train_y = data[\"score\"].values.flatten()\n", + "train_x = data.drop(\"score\", axis=1).values.tolist()\n", + "data = {\"data\": train_x[:500]}\n", + "data = json.dumps(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Set up a Timer to see how long the model takes to predict\n", + "with Timer() as t:\n", + " score = aks_service.run(input_data=data)\n", + " print(\"Time elapsed: {}\".format(t))\n", + "\n", + "result = json.loads(score)\n", + "\n", + "try:\n", + " output = result[\"result\"]\n", + " print(\"Number of sample predicted : \".format(len(output)))\n", + " print(output)\n", + "except:\n", + " print(result[\"error\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get Pearson Correlation\n", + "print(pearsonr(output, train_y[:500])[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", 
+ "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scenarios/sentence_similarity/baseline_deep_dive.ipynb b/scenarios/sentence_similarity/baseline_deep_dive.ipynb index e99de3109..922862120 100644 --- a/scenarios/sentence_similarity/baseline_deep_dive.ipynb +++ b/scenarios/sentence_similarity/baseline_deep_dive.ipynb @@ -81,14 +81,23 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n", + "Gensim version: 3.7.3\n" + ] + } + ], "source": [ "#Import Packages\n", "import sys\n", "# Set the environment path\n", - "sys.path.append(\"../../../\") \n", + "sys.path.append(\"../../\") \n", "import os\n", "from collections import Counter\n", "import math\n", @@ -124,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -156,20 +165,53 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:01<00:00, 247KB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data downloaded to C:\\Users\\cocochra\\AppData\\Local\\Temp\\tmpp2a0cw_t\\raw\\stsbenchmark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████| 401/401 [00:01<00:00, 243KB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data downloaded to 
C:\\Users\\cocochra\\AppData\\Local\\Temp\\tmpp2a0cw_t\\raw\\stsbenchmark\n" + ] + } + ], "source": [ "# Produce a pandas dataframe for the training and test sets\n", - "sts_train = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", - "sts_test = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")" + "train_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", + "test_raw = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n", + "\n", + "# Clean the sts dataset\n", + "sts_train = stsbenchmark.clean_sts(train_raw)\n", + "sts_test = stsbenchmark.clean_sts(test_raw)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -188,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -308,7 +350,7 @@ "9 A man is playing a trumpet. " ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -347,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -372,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -504,7 +546,7 @@ "4 [man, seated, playing, cello, .] 
" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -550,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -579,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -598,7 +640,7 @@ "11498" ] }, - "execution_count": 11, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -623,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -656,14 +698,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Vector file already exists. No changes made.\n" + "100%|████████████████████████████████████████████████████████████████████████████| 1.61M/1.61M [01:08<00:00, 23.4kKB/s]\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" ] } ], @@ -691,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -739,7 +783,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -775,7 +819,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -805,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -856,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -887,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -922,14 +966,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Vector file already exists. No changes made.\n" + "100%|████████████████████████████████████████████████████████████████████████████| 2.13M/2.13M [01:58<00:00, 17.9kKB/s]\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" ] } ], @@ -939,7 +985,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -968,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1009,14 +1055,14 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "10.4GB [35:23, 4.88MB/s] \n" + "100%|████████████████████████████████████████████████████████████████████████████| 2.56M/2.56M [01:46<00:00, 24.0kKB/s]\n" ] } ], @@ -1026,16 +1072,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:12: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:12: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", " if sys.path[0] == '':\n", - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:29: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n" + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:29: DeprecationWarning: Call to deprecated `__getitem__` 
(Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n" ] } ], @@ -1058,21 +1104,21 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", " del sys.path[0]\n", - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", " \n", - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:21: DeprecationWarning: Call to deprecated `wmdistance` (Method will be removed in 4.0.0, use self.wv.wmdistance() instead).\n", - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:21: DeprecationWarning: Call to deprecated `wmdistance` (Method will be removed 
in 4.0.0, use self.wv.wmdistance() instead).\n", + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n", " app.launch_new_instance()\n", - "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\azureml\\lib\\site-packages\\ipykernel_launcher.py:17: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n" + "C:\\Users\\cocochra\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp_gpu\\lib\\site-packages\\ipykernel_launcher.py:17: DeprecationWarning: Call to deprecated `__contains__` (Method will be removed in 4.0.0, use self.wv.__contains__() instead).\n" ] } ], @@ -1109,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -1123,8 +1169,8 @@ " Returns:\n", " list: predicted values for sentence similarity of test set examples\n", " \"\"\"\n", - " stop_word_param = 'english' if rm_stopwords else None\n", - " \n", + " stop_word_param = \"english\" if rm_stopwords else None\n", + "\n", " tf = TfidfVectorizer(\n", " input=\"content\",\n", " analyzer=\"word\",\n", @@ -1132,21 +1178,30 @@ " stop_words=stop_word_param,\n", " sublinear_tf=True,\n", " )\n", - "\n", - " all_sentences = df[[\"sentence1\", \"sentence2\"]]\n", - " corpus = all_sentences.values.flatten().tolist()\n", + " all_sentences = df[[\"sentence1\", \"sentence2\"]]\n", + " corpus = np.concatenate([df[\"sentence1\"].values, df[\"sentence2\"].values])\n", " tfidf_matrix = np.array(tf.fit_transform(corpus).todense())\n", + " num_samples = len(df.index)\n", " \n", - " df['sentence1_tfidf'] = df.apply(lambda x: tfidf_matrix[2*x.name,:], axis=1)\n", - " df['sentence2_tfidf'] = df.apply(lambda x: tfidf_matrix[2*x.name+1,:], axis=1)\n", - " df['predictions'] = 
df.apply(lambda x: calculate_cosine_similarity(x.sentence1_tfidf, x.sentence2_tfidf) if \n", - " (sum(x.sentence1_tfidf) != 0 and sum(x.sentence2_tfidf) != 0) else 0,axis=1)\n", - " return df['predictions'].tolist()" + " # calculate the cosine similarity between pairs of tfidf embeddings\n", + " # first pair at index 0 and n in tfidf_matrix, second pair at 1 and n+1, etc.\n", + " df[\"predictions\"] = df.apply(\n", + " lambda x: calculate_cosine_similarity(\n", + " tfidf_matrix[int(x.name), :], tfidf_matrix[num_samples + int(x.name), :]\n", + " )\n", + " if (\n", + " sum(tfidf_matrix[int(x.name), :]) != 0\n", + " and sum(tfidf_matrix[num_samples + int(x.name), :]) != 0\n", + " )\n", + " else 0,\n", + " axis=1,\n", + " )\n", + " return df[\"predictions\"].tolist()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 34, "metadata": { "scrolled": true }, @@ -1185,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -1206,7 +1261,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1264,7 +1319,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 37, "metadata": { "scrolled": true }, @@ -1294,7 +1349,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1314,30 +1369,33 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Word2vec Cosine 0.6337760059182685\n", - "Word2vec Cosine with Stop Words 0.647674307797345\n", - "Word2vec WMD 0.6578256301323717\n", - "Word2vec WMD with Stop Words 0.5697910628727219\n", - "GLoVe Cosine 0.642064729899729\n", - "GLoVe Cosine with Stop Words 0.5639670964748242\n", - "GLoVe WMD 0.6272339050920003\n", - "GLoVe WMD with Stop Words 
0.48560149551724\n", - "fastText Cosine 0.6288780924569854\n", - "fastText Cosine with Stop Words 0.5958470751204787\n", - "fastText WMD 0.5275208457920849\n", - "fastText WMD with Stop Words 0.44198752510004097\n", - "TF-IDF Cosine 0.6683811410442564\n", - "TF-IDF Cosine with Stop Words 0.7034695168223283\n", - "Doc2vec Cosine 0.4984144504392967\n", - "Doc2vec Cosine with Stop Words 0.4172218818503345\n" - ] + "data": { + "text/plain": [ + "{'Word2vec Cosine': 0.6476606845766778,\n", + " 'Word2vec Cosine with Stop Words': 0.6683808069062863,\n", + " 'Word2vec WMD': 0.6574175839579567,\n", + " 'Word2vec WMD with Stop Words': 0.5689438215886101,\n", + " 'GLoVe Cosine': 0.6688056947022161,\n", + " 'GLoVe Cosine with Stop Words': 0.6049380247374541,\n", + " 'GLoVe WMD': 0.6267300417407605,\n", + " 'GLoVe WMD with Stop Words': 0.48470008225931194,\n", + " 'fastText Cosine': 0.6707510007525627,\n", + " 'fastText Cosine with Stop Words': 0.6771300330824099,\n", + " 'fastText WMD': 0.6394958913339955,\n", + " 'fastText WMD with Stop Words': 0.5177829727556036,\n", + " 'TF-IDF Cosine': 0.6749213786510483,\n", + " 'TF-IDF Cosine with Stop Words': 0.7118087132257667,\n", + " 'Doc2vec Cosine': 0.528387685928394,\n", + " 'Doc2vec Cosine with Stop Words': 0.45572884639905675}" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1350,19 +1408,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We investigate our 8 models with and without stop words (16 different results total). The results show that TF-IDF bag-of-words document embeddings (without stop words) combined with the cosine similarity performs the best, with a Pearson correlation of 0.7034. " + "We investigate our 8 models with and without stop words (16 different results total). The results show that TF-IDF bag-of-words document embeddings combined with the cosine similarity performs the best." 
] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 40, "metadata": { "scrolled": true }, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhMAAAEWCAYAAADchhUKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3XecXVW5//HPl4C0UKQaVIh0gUCAECAGBMGGDQUFRCSgInKFCwrKT7yC7YIXFAsIl6vSO4oFUDokpJCQEJJQgpRIFektgCR5fn+s5zA7J2dmzuRMpiTf9+s1r+yzy9prrzOw16y99vMoIjAzMzNbWEv1dgXMzMysf3NnwszMzFrizoSZmZm1xJ0JMzMza4k7E2ZmZtYSdybMzMysJe5MmFm3kvQjSc9I+mdv16UvkPQ+SX+X9IqkPXu7PgCSRkm6rfL5FUnr92adukJSSNqwif12kfRYT9RpSefOhNkSTtIsSa/lDeUpSWdLGriQZb0b+CawWUS8o3tr2m/9ADgtIgZGxB/rN9a1//OSrs527DFZt4e6u1xJt+SNf6u69X/M9bt09zmtd7gzYWYAn4iIgcA2wHbAd7tagKSlgfWAZyPiXwt5/OJoPeDuTvaptf8g4CngV4u8Vj3nfuCLtQ+SVgd2AJ7utRpZt3NnwszeEhGPA38FtgCQtIqk30p6UtLj+QhjQG4bJWmspFMlPQfcAlwPrJN/ZZ+T+31S0t2SXsi/VN9bO1/+Vf5tSdOAVyUtneuOkTRN0qt5/rUl/VXSy5JukPT2ShmXS/qnpBcljZa0eWXbOZJOz7/2X5Z0u6QNKts3l3S9pOdyVOY7uX4pScdKelDSs5Iuk7Rae+0m6SuSHshy/ixpnVz/ILA+8Jdsk2U7af/XgSuAzSplf0zSnZJekvSopBMq25aTdEHW8QVJkySt3dl316D+bz02aKLNNq202UxJn+vomoALgX0q594PuBL4d6XMZSX9XNIT+fPzalvl78OTue3gurovK+kUSY/kd3impOXbuc5vZ1u8nHXfrZO6W5PcmTCzt+Tw+h7AnbnqXGAOsCGwNfAh4MuVQ7YHHgLWAj4IfBR4IofNR0naGLgYOBJYE7iGcmN9W6WM/YCPAatGxJxct1eWtzHwCUoH5zvAGpT/bx1ROf6vwEZZhymUm1fVfsD3gbcDDwA/zmtdCbgB+BuwTl7jjXnMEcCewPtz2/PA6e202QeAE4HPUUYW/gFcAhARGwCPkCMPEfFGozIqZa0A7ANMqKx+lfKX/arZTl9T29yLA4FVgHcDqwOHAq/lts6+u46012YrUjqMF1Haez/g19UOXANPAPfk+clrOa9un+MooxVDga2A4eTomKSPAEdTfh82AnavO/YnlN+ToXmt7wS+V18JSZsAXwe2i4iVgA8Dszqot3VFRPjHP/5Zgn8o/0N9BXiBciP8NbA8sDbwBrB8Zd/9gJtzeRTwSF1ZuwCPVT7/F3BZ5fNSwOPALpVzH9ygPvtXPv8eOKPy+XDgj+1cy6pAAKvk53OA31S27wHcV7mWO9sp515gt8rnQcCbwNIN9v0t8D+VzwNz38GV69m9yfafQ7n5Dulg/58Dp+bywcA4YMu6fZr57m6rbAtgwybabB9gTN25/hc4vp263kLpwHyB0qncBLg/tz1W+T14ENijctyHgVm5/DvgpMq2jWv1BUTpbG1Q2b4j8HD972Pu/y9KZ2SZ3v7vbnH7WVyfUZpZ1+wZETdUV0gaAiwDPCmptnop4NHKbtXlRtahdFAAiIh5kh6l/PXYURlPVZZfa/B5YNZxAOWv5s9SRj7m5T5rAC/mcvW
tktm1Yyl/zT/YTr3XA66UNK+ybi7lJv143b7rUEZEAIiIVyQ9S7nGWe2UX2/PiLghr+dTwK2SNouIf0raHjiJ8ujpbcCywOV53Pl5HZdIWhW4gPJX/np0/t11pL02Ww/YXtILle1LZz068gfgp8Cz7ew73+9JLq9T2Ta5blvNmsAKwOTKdQpY4HFORDwg6UjgBGBzSdcC34iIJzqpuzXBjznMrD2PUv66XSMiVs2flSOiOqTdWdrhJyg3IABU/o//bua/IbeSuvjzlJvv7pTh/sG1UzVx7KPABh1s+2jluleNiOWizCmpV3+NK1IeOTTat0MRMTci/kDpuIzM1RcBfwbeHRGrAGeS1xcRb0bE9yNiM2AE8HHKY4RmvruF8Shwa127DIyIr3VyXbMpj6O+RuPOxHxtCKyb6wCepPzOVLfVPEPpXG5eqc8qUSazNqrHRRExMs8VlEck1g3cmTCzhiLiSeA64KeSVs5JiRtIen8XirkM+Jik3SQtQ3lt9A3K0Hx3WCnLe5byF+p/d+HYq4B3SDoyJ/GtlKMAUG7YP5a0HoCkNSV9qp1yLgIOkjQ0Jw3+N3B7RMzq6sWo+BRlrsK9uXol4LmIeF3ScEoHqrb/rpKG5IjGS5THK3O76btr5CpgY0kHSFomf7ZTZVJtB74DvL+ddrkY+G628xqUOQ8X5LbLgFGSNss5JcfXDoqIecD/AadKWgtA0jslfbj+BJI2kfSB/I5ep3RC5jZ74dYxdybMrCNfpAyt30OZhHgFZf5AUyJiJuV5+a8of0V+gjIZ8d8dHti88yjD3o9nHSd0vPt8dXuZMqnvE5Rh/b8Du+bmX1BGA66T9HKWu3075dxImRvye8pf0RsA+3bxOv4i6RVKh+DHwIERUXud9DDgB1mP71FurjXvoHwnL1E6H7fSdhNu6btrJNvsQ5Tre4LSbj+hPHrp7NgnIuK2djb/CLgDmAZMpzw2+lEe91fKPJGbKJNBb6o79tu5foKklyiTajdpcI5lKY+Lnsl6r0Xp4Fg3UEQrI4xmZma2pPPIhJmZmbXEnQkzMzNriTsTZmZm1hJ3JszMzKwlDlplS4Q11lgjBg8e3NvVMDPrVyZPnvxMRKzZ2X7uTNgSYfDgwdxxxx29XQ0zs35F0j8638uPOczMzKxF7kyYmZlZS9yZMDMzs5a4M2FmZmYtcWfCzMzMWuLOhJmZmbXEnQkzMzNriTsTZmZm1hIHrbIlwvTHX2TwsVf3djXMbAkz66SP9XYVeoRHJszMzKwl7kz0MEmnSjqy8vlaSb+pfP6ppG+0UP4Jko7O5ZMl3SdpmqQrJa3aWu07PffReb4Zku6S9MWFKOPQhTnOzMx6jzsTPW8cMAJA0lLAGsDmle0jgLHNFCRpQCe7XA9sERFbAvcD/6/LtW2SpEOBDwLDI2ILYGdAXS0nIs6MiPO6u35mZrbouDPR88aSnQlKJ2IG8LKkt0taFngvcKeKk/Ov/OmS9gGQtIukmyVdBEzPdcdJminpBmCT2oki4rqImJMfJwDvyv1vl/RWB0bSLZK2lbSipN9JmiTpTkmfyu0DJJ2S9Zgm6fAG1/Ud4LCIeCnP/WJEnJvH75blTc/yl831J0m6J8s8JddVR1ZukfQTSRMl3S9pp0p9Ts56TpP01da+EjMza4UnYPawiHhC0hxJ61I6FeOBdwI7Ai8C0yLi35L2AoYCW1FGLyZJGp3FDKeMODwsaVtgX2Bryvc5BZjc4NQHA5fm8iXA54DjJQ0C1omIyZL+G7gpIg7ORyITs4PyReA9wNYRMUfSatWCJa0ErBQRD9afVNJywDnAbhFxv6TzgK/lv58GNo2I6OARzNIRMVzSHsDxwO7Al4AXI2K77JiMlXRdRDxcd+5DgEMABqzcaQZdMzNbSB6Z6B210YlaZ2J85fO43GckcHFEzI2Ip4Bbge1y28TKjXMn4MqImJ2jAn+uP5mk44A5wIW56jLgs7n8OeDyXP4QcKykqcAtwHL
AupQb+Jm1UY6IeK7+FEC0c62bAA9HxP35+VzKI5CXgNeB30j6DDC7neP/kP9OBgZX6vnFrOftwOrARvUHRsRZETEsIoYNWGGVdoo3M7NWeWSid9TmTQyhPOZ4FPgm5Qb7u9yno/kGr9Z9bu9GjqQDgY9TRgYCICIel/SspC2BfYDaYwIBe0XEzLoyOuosEBEvSXpV0voR8VB9Fdo5Zo6k4cBulJGVrwMfaLDrG/nvXNp+XwUcHhHXtlcnMzPrOR6Z6B1jKTf453Lk4TlgVcqjjvG5z2hgn5wfsCblr/mJDcoaDXxa0vL5uOETtQ2SPgJ8G/hkRNT/5X8J8C1glYiYnuuuBQ7PzgOSts711wGHSlo616/Ggk4ETpe0cu6zcj5muA8YLGnD3O8A4FZJA/Pc1wBHUh7pNOtayqOSZfJcG0tasQvHm5lZN/LIRO+YTpkHcVHduoER8Ux+vpLSubiLMirwrYj4p6RNqwVFxBRJlwJTgX8AYyqbTwOWBa7P/sGEiDg0t10B/AL4YWX/HwI/B6Zlh2IWpdPzG2DjXP8m8H9ZdtUZwEDK3I43gTeBn0bE65IOAi7Pzsgk4ExgNeBPOadCwFGdtlqb31AeeUzJej4N7NmF483MrBspR77NFmvDhg2LO+64o7erYWbWr0iaHBHDOtvPjznMzMysJe5MmJmZWUs8Z8KWCE70ZWZ9yeKWAMwjE2ZmZtaSDjsTclKqzsrosaRUkq6RtGr+HFZZv4ukq5o4focMoz1V0r2STqgcP6KTw5ut45WS9qx8ninpu5XPv88AVQtb/jmS9m61nmZm1r06G5lwUqoO9GRSqojYIyJeoMSjOKyz/Rs4FzgkIoYCW1CiYALsQluukFZVf19WB16hvN5asyNtET47VItpYWZmfV9nnQknpeqBpFSSviXpiFw+VdJNlbpckMuzJK0BnARskCMMJ2cRAyVdkSMtF2bshXprAU/m9c6NiHskDQYOBY7K8naStJ6kG7OuN6rkEKmNCpwpaUxe38cbnKP6+zICuApYM38/3gO8lrEylpN0drbxnZJ2zXOMknS5pL8A1+Vxp2W7X53XUGuzBb4PMzPrHR3+9eekVD2WlGo0JZz2L4FhwLIq0R1HMn8QKoBjsz2HZp13yfbcHHiCckN/H3Bb3XGnAjMl3QL8DTg3ImZJOhN4JSJqHaS/AOdFxLmSDs461R5dDAbeD2wA3Cxpw4h4vXKOycAWkt5G+X25FVif0uncmrZRrP8AiIghKkG4rpO0cW7bEdgyIp5TeSSyCSXs+NrAPcDv8jvt9PuQE32ZmfWIZiZgOinVok9KNRnYNjs6b1DaeBilveo7E41MjIjHImIeJRLm4PodIuIHWeZ1wOcpHYpGdqQtMuf5lO+25rKImBcRfwceAuqjcb4B3A1sA+yQ19ve78v5ecx9lMidtc7E9ZXvbGfafq+eAG7K9U19H070ZWbWM5rpTNQnpZpAueFU50t0d1Kq/atJqYBqUqpLKufcKyKG5s+6EXEvHXcWyE7Mq5LWb1SFdo6ZQxlh+T3lr/T2bsQdJaWq1fM9EXFdXflvUkJXH0Rp7zHArpQRgHvbu5YG560/d/11PBgRZ1CSa22lMq+hM9HOcqPPUOq/M2X053nK70utM9Etvy9d+D7MzKwHNDsy4aRUiz4p1Wjg6Px3DGUuw9Rap6riZWClLpyfPO/HKnMpNqJ0Ol5oUN44yqMogP2Z/3HJZyUtJWkDyuOL+bKLprGULKR35edplFGKdSmjFlCucf+s18a5rVFZo4F98/dqEKWDRYvfh5mZdbNmZsw7KVXPJKUaAxwHjI+IVyW9ToNHHBHxrKSxkmYAfwWajcR0AHCqpNmUx0j7R8TcnCNxhcoE1sOBIyjzEo7Juh5UKWMm5RHW2sChdfMlasZROhonZn3nSPoX8Gg+hgH4NXCmpOlZl1ER8YYWnDd6JSUt+XTKGz635vqVWPjvw8zMupkTfVlTJJ0DXBURV/R2XRa
GE32ZmXWdnOjLzMzMeoIDA1lTImJUb9ehFc7NYWZ9XX/O1+GRCTMzM2uJOxN9iBbDXCiStsoYG7XP+0maXXm7ZYikabl8i6RHKm+dIOmPkl7J5cGSXlOJmnmvSrTRAxdFvc3MrHnuTPQti2MulOnAevkqMJRruI8SEbP2uXpNL1AieJIdnEF15T0YEVtHxHspr7AelW/gmJlZL3Fnom9Z7HKh5Ougk4Dtc9W2wOnMn8OjmvzrEtriXHyGtqiiC4iIh4BvUF5nNTOzXuLORB+SIaPrc6HcTonhMYzMhUK5ydZyoewOnJxBnaBEhjwuIjbT/LlQPkNbiPN6B1NiVkBbLhRUyYVCiYFxU0RsRwkedXIG3zqEtlwoW9IWBr1qHDAi959HCX9e7UxURyZuBHbOkZV9acvR0p4p1IX1rpF0iKQ7JN0xd/aLnRRjZmYLy52Jvmdxy4VSvabhwKRMsrahSrTUgTnCUDOXEnVzH2D5iJjVflOVS2hvg3NzmJn1DL8a2vfU50J5lJJR9CXgd7lPd+dC2a2aC0VSNRdKLWV6LRfKzLoyOsyFkiZQOjsjaQvB/hhl5GFcg/0voUS/PKGTcqGMujSTv8TMzBYRj0z0PYtdLpSIeJnSKRpVuYbxlLwajToTYyjhuC9usO0tkgYDpwC/6mg/MzNbtNyZ6HtquVAm1K17sS4XyjRKLpSbyFwo9QVFxBTKnIOplAyb9blQVqLkQpkq6czKtisoowaXVdb9EFiGkvNkBm15Un4DPJLr76KkN29kLLBsRDyan8dTcngs0JmI4pTK9VZtUHs1NOv3q4g4u51zmplZD3BuDlsiODeHmVnXOTeHmZmZ9Qh3JszMzKwlfpvDlghO9GVm/TmRVl/nkQkzMzNryUJ3JpyUqueSUklaR9IVuTxU0h6VbW+1UydlHFwJeT2jEg57lKR1FqZedeVL0jOS3p6fB0kKSSMr+zwtafUWzjFL0hqt1tXMzLpXKyMTTkrVQ0mpIuKJiNg7Pw4F9uho/3qS3kUJhz0y23AHyqulUGI/tNyZyKBXtdDfUNrqTtp+RzYBnomIZ5ussx/BmZn1E610JpyUqpuSUkm6JiNOkvX9Xi7/UNKXc5RjhqS3AT+gBKyaWmtLYLO89ockNUp6tRbwMvBK1uWViHhY0t6UnB8XZnnLS9ot6zA923DZrMssST/JEZaJkjZscJ7q78QI4GfM37kYl2WtJ+nG/A5uVMlFgqRzJP1M0s3ATyStLum6rM//kpE/8/u9WtJd2S77YGZmvWahOxNOSgV0X1Kq0cBOklam5Ml4X64fSSXQVLbn94BLI2JoRNTOtynwYUp7Hq98LFNxF/AU8LCksyV9Isu7ArgD2D8ihlLCYp8D7BMRQygTdL9WKeeliBhOCXj18wbX8dZoVdblj8C783O17U4Dzqt8B7+slLExsHtEfBM4HrgtIram5BVZN/f5CPBERGwVEVsAf2tQFyf6MjPrIa1OwHRSqu5JSjWGEhJ7JHA1MFDSCsDg+lwY7bg6It7IiJH/AtauboyIuZQb8N6Ux0SnSjqhQTmbAA9HxP35+dysV83FlX93ZEETga2zI7ZMRLwCPJSjGNXfiR2Bi3L5/LzumsuzvuS5L8hruBp4PtdPB3bPkZKdIqJhT8GJvszMekarnYn6pFQTKDeK6l+h3Z2Uav9qUiqgmpTqkso598q/3odGxLoRcW+uXxRJqX7F/KGn29NeUqpJlNGcnSijFHcCXwEmN1EmwBuV5bk0eOU3Q1RPjIgTKdeyV4NyOvquYP62W6AdM8fHA5TRoym5egJljsdaQHsdo2pZnf5OZGdnW0qn4sTaYyEzM+sd3TEy4aRULSalyscXj1JGVyZkmUczfy6NmpcpOTWapvI2yDaVVUOBfzQo7z5gcGU+xAGUkaSafSr/jqexsZS2qrbdfwITap1ASjvW5prsTxndaWR0bkfSR4HamyLrALMj4gJKm27TzvFmZtYDWu1MOClV9yW
lGgM8lZ2lMZRJpo06EzdTJlxWJ2B2ZhngFJXXa6dSOgP/mdvOAc7M9QIOAi6XNJ0yZ6Ta1stKuj2PPaqdc42ltFWtMzElr6XadkcAB6m8ZntApS71vk+ZkzKF8ujqkVw/BJiYdT4O+FHHl29mZouSE31ZUyTNAoa102nq85zoy8ys6+REX2ZmZtYTHBjImhIRg3u7Dq1wbg4z6wsW1/wgHpkwMzOzlrgz0Y9JWlvSRRn5crKk8ZI+ndt2kXRVk+WcIOnEunVDc+JoV+pzdE7ynJHRKb/YleOzjEMX5jgzM+s97kz0U/na6x+B0RGxfkTUIoi+ayGKu5i21z5r9qUtsFQz9TkU+CAwPKNS7kzncSsWEBFnRsR5XT3OzMx6jzsT/dcHgH9HxFuvbkbEPyJigTgWVY1yb2SUzRckbV/Z9XNkEDBJH8pRjymSLpc0sEHR3wEOy+ilRMSLEXFue+fM9SdJuidzdJyS66rZYm+p5AO5X9JOuX6ASr6XSXnsVxeuCc3MrDu4M9F/bU5blMmmSFqO9nNvXEwGkpK0A/BsRPxdJeX3dyn5Mrah5PL4Rl25KwErZejxps6ZAcM+DWyeOTraixWxdOYDOZKSqwPgS5RYJttRopV+RdJ7GpzbuTnMzHqAOxOLCUmn5zyFSR3s1lHujUuAvVXSye9LW0TPHYDNgLEZJOpAYL3609N+mPL2zvkS8DrwG0mfAeojm9bUMrFOBgbn8oeAL2Z9bgdWBzaqP9C5OczMeoZfDe2/7qaSXyMi/iNHETqKzNTuHIaIeDQDU70/y92xcsz1EbFfB8e+JOlVSevXJUJr95wRMUfScGA3Sufl65RHN/VqeUeqOUcEHB4R17ZXJzMz6zkemei/bgKWk1RNEb5CJ8d0lnvjYuBU4MGIeCzXTQDeVztG0gqSNm5Q9onA6Spp1JG0sqRD2jtnzrtYJSKuoTzCGNr5Jb/lWsqjkmXyXBurZCo1M7Ne4JGJfioiQtKelHTi3wKepmTc/HZlt90kPVb5/Fnacm8sTclWWs29cTnwC+DwynmeljQKuLg2cZIyh+J+5ncGMBCYJOlN4E3gpxHxuqRG51wN+FPOqRDt5/po5DeURx5T8q2Wp4E9u3C8mZl1I+fmsCWCc3OYmXWdc3OYmZlZj3BnwszMzFriORO2RHCiL7Ml0+KaWKuv8ciEmZmZtaTHOhNOStUaSddIWjV/Dqusb6rtJO0g6XZJUyXdK+mEyvEjuqmOV+YbJrXPMyV9t/L59xmgamHLP0fS3q3W08zMulePdCaclKp1EbFHRLwArAoc1tn+DZwLHBIRQ4EtgMty/S5At3QmgHG1siStDrxCW/ArcnlcMwXla6RmZtYP9NTIhJNSdZCUStK3JB2Ry6dKuqlSlwtyeVZGuDwJ2CBHGE7OIgZKuiJHWi7Mzlu9tYAn83rnRsQ9kgYDhwJHZXk7SVpP0o1Z1xslrZvnP0fSmZLG5PV9vME5xtLWMRkBXAWsqeI9wGsR8U9Jy0k6O9v4Tkm75jlG5Xf2F+C6PO60bPer8xpqbbbA92FmZr2jpzoTTkrVcVKq0cBOuTyM0jlYBhgJjKnb91hKhMqhEXFMrts6z7kZsD7wvgZ1OxWYmY8ivippuYiYRQkgdWqWNwY4DTgvr/NC4JeVMgZTwm1/DDgz26tqMrCFpLdROhPjgZnAe/Pz2NzvPwCyjfcDzq2UtSNwYER8gNLmmwBDgK/QNurR1PchJ/oyM+sRvTIBU05KVZ+UajKwbXZ03qDchIdROhj1nYlGJkbEYxExD5haOfdbIuIHWeZ1wOeBv7VT1o60PTI6n9KhqbksIuZFxN+Bh4BN687xBiVnyDaU7+L2vJYR+VN7xDEyyyYi7gP+AdRCdF8fEc/l8s7AxTmS8gQlhDg0+X040ZeZWc/oqc5E7QYDlKRUlARPa3ZwTIdJqYBZtCWluqxyzPX5V/bQiNgsIr5Ud+xLwKuS1m/2nBExBxgO/J4Strm9G3F
HSalqdXpPRFxXV/6beT0HUW64Y4BdgQ2AZiaWvlFZrp67/joejIgzKG2/Vc5r6Ey0s9zoM5T670wZ/Xmektuj1pmojUx0ND/l1c7O0YXvw8zMekBPdSaclKrzpFSjgaPz3zGUuQxTY8F45y8DK3Xh/OR5P1aZS7ERpdPxQoPyxpGPkID9gdsq2z4raSlJG1Aep8xscKqxwFeBu/LzNMooxbqUTiWUa9w/67VxbmtU1mhg35x3MojSwaLF78PMzLpZj8yYd1KqppJSjQGOA8ZHxKuSXqfBI46IeFbSWEkzgL8CzUZiOoDS/rOBOcD+ETE3JzteIelTlLY8AvidpGOyrgdVyphJ6dCtDRwaEa83OM84SkfjxKzvHEn/Ah7NxzAAv6bMuZiedRkVEW80mDd6JWXy7nTKd1jrTK7Ewn8fZmbWzZzoy5oi6Rzgqoi4orfrsjCc6MvMrOvkRF9mZmbWExwYyJoSEaN6uw6tcG4OM+spS2I+EI9MmJmZWUvcmVhMqY/kQpG0VcbYqH3eT9LsytstQyRNy+VbJD1SeesESX+U9EouD5b0WkbNvFcl2uiBzdTDzMwWHXcmFkN5M+4ruVCmA+tlQC4o8Sbuo0TtrH0eW9n/BTKCp6RVgUF15T0YEVtHxHuzHkflGzhmZtZL3JlYPPWZXCj5OugkoHb8tsDpzJ/Do5r86xLa4lx8hraooguIiIco4dKP6Oi6zMxs0XJnYvHUZ3KhpHHAiAzWNQ+4hfk7E9WRiRuBnSUNyHNe2knVp1AX1rtyTc7NYWbWA9yZWAL0ci4UaMsmOhyYlEnWNpS0JjAwRxhq5lKibu4DLJ/JyDq8vPY2ODeHmVnP8Kuhi6e7KTlLgJILJUcROora1GEuFEmzaMuFsmPlmOsjYr9O6jOBkjF1JCXxF8BjlI7JuAb7X0KJfnlCJ+VCmXvR1GRQMzNbNDwysXjqU7lQIuJl4FFgFG2difGUvBqNOhNjKOG4L26w7S2SBgOnAB3OBTEzs0XLnYnFUCYH2xN4v6SHJU2kPLZYIBdK7YfyF34tL8l0ytyG+lwom5MTL/M8T1M6CBfn650TaGf+AuVRx7KZ8RVKZ2J9GnQmojglIp5pUM4GtVdDKdlifxURZ3fUHmZmtmg5N4ctEZybw8ys65ybw8zMzHqEOxNmZmbWEr/NYUsEJ/oy6zlLYqKrJZ1HJszMzKwlfaIz4aRUndZrHUlXVK5nj7prPrqJMg7OMNnTJM2Q9KlcP0rSOgtTr7ryJekZSW/Pz4OlrmS3AAAgAElEQVQkhaSRlX2elrR6C+eYlfEyzMysD+n1zoSTUnUuIp6IiL3z41Bgj472ryfpXcBxwMiI2JISuXJabh4FtNyZyNdRb6ctoNUI4M78F0mbAM9ExLNN1tmP4MzM+ole70zgpFRIukbSlrl8p6Tv5fIPJX05RzlmSHob8ANgH0lTJdU6TpvlSMlDkholvVoLeBl4JevySkQ8LGlvYBhwYZa3fKN2zbrMkvSTHGGZWAluVVULm11rq58xf+diXJa1nqQbc5TkRknr5vpzJP1M0s3ATyStLum6rM//klE6Ja0o6WqVEOEzKu1gZma9oC90JpyUCkYDO0laGZhDjnZQwk+Pqe0UEf8GvgdcGhFDI6J2vk2BD1NyXxxfeyxTcRfwFPCwpLMlfSLLu4LSDvtHxFAgaL9dAV6KiOHAacDPG1zHONraajhlxOnd+bnadqcB5+UoyYXALytlbEz5jr4JHA/cFhFbA38G1s19PgI8ERFbRcQWwN8a1MWJvszMekhf6EzMR0tmUqoxWf+RwNXAQEkrAINztKUzV0fEGxkx8l/A2tWNETGXcgPeG7gfOFXSCQ3K6ahdoa0tL6ZtxKFqIrB1dsSWiYhXgIdyFKM6qrMjbY+ezs/rrrk860ue+4K8hquB53P9dGD3HCnZKSIa9hSc6MvMrGf0hefSTkpVHq0MAx4CrgfWAL4CTG6iTIA3KstzafC95pyGicB
ESdcDZzeoc7vtWiumneXaOWZLegA4mLbRpgmUOR5rAe11jKplvdrBttp57pe0bZZ7oqTrIuIHndTdzMwWkb4wMrHEJ6XKxxePUuZ3TMgyj6byiKPiZWClBus7Ovc6kraprBoK/KNBeZ216z6Vf8fT2FhKW1Xb7j+BCdEWu30cbXNN9qeM7jQyOrcj6aNA7U2RdYDZEXEBpU23aed4MzPrAb3emXBSqreMAZ6KiNm5/C4adyZupky4rE7A7MwywCmS7stHPPtQbvBQ5kicmetFx+26rKTb89ij2jnXWEpb1ToTU/Jaqm13BHBQfg8HVOpS7/uUOSlTgA8Bj+T6IZQRlqmUt1R+1PHlm5nZouREX9aUfHQ0rJ1OU5/nRF9mZl0nJ/oyMzOzntAXJmBaPxARg3u7Dq1wbg4z60lLWn4Sj0yYmZlZS9yZ6KMkHaGS0+PCLh63qqTDcnlITtScKum5nOA6VdINC1Gf9SXt28H2TSX9VdLfs96XSFqri+cYIKnRpFMzM+vD3Jnouw4D9oiI/bt43Kp5LBExPSNlDqVEkDwmP+++EPVZn7bXOecjaXngKsrbKhtlLpL/A7qU1Csi5kbETgtRNzMz60XuTPRBks6k3Lz/LOkoScMljcvXTMepJM1C0uaZJ2Nq5rnYCDiJ8krqVEknd3KeY/P4aWrLB7JjHvs2SQMl3SPpvVnurrmtPv/HAZREbdfUVkTEjRFxr0q+j3NVcn1MkbRznmeIpEmVuq8vaWlJL+T23VXydvxB0kxJ51XqvZ2kW1UyzP5V0tqYmVmv8QTMPigiDpX0EWDXiHhGJWfHzhExR9LuwH9TonseCvwiIi5USQI2ADgW2CJHI9qlksZ8XUpSMwHXSBoREeMk/Y2SUOztwNnZKTgW+HpE7NmguC1oP1rnEZREbkMkbZ7n2YgyenJKRFyqkkysUfTNbSgh0P8FTFDJtXIn8Avgk9k2+wM/BA5pcI2H1NYPWHnNjprDzMxa4M5E/7AKcG7ehIMShApKYKjjVFKM/yETmjVb5oeAj1JuzgADKUm2xlESbE0GXmL+RF8LYyRwMkBE3C3pCWDDPM93Ja2XdX9AC6YdnxARTwJkgKrBwOuUgGQ35LUOoIQ7X0BEnAWcBbDsoI0cUMXMbBHxY47+4YfAzZkh8xPAcgARcRHwSeA14FpJH+hCmQJ+VJtTEREbRsQ5uW0NSkjzlYFlmyjrbkq69vbOs4CIOB/4NCWvyPW1xx91GuUcETCtUu8hEfHRJupoZmaLiDsT/cMqwOO5PKq2UtL6wEMR8UvKBMstaT53x7XAl1QyfCLpXSoJ1qD8NX8sJSz5ibmuo3LPp4RD/0ilbntI2oz582u8FxgEPCBp/Yh4ICJ+QcmUumUTdQa4B3inpOFZ5tvy8YmZmfUSdyb6h/+hZMccSxnWr9kHmJGPADYFzouIZylp1md0NAEzJ0teQZmLMJ2SO2SgpIOBVyPiMuDHlORo76c8Dhmgkh7+iLqyZlNGTI7KV0PvAb4APE1JbLZ8nuNC4IuZ2Ozzku7Ouq9PphrvTES8QUml/jNJd2W9tm/mWDMzWzScm8OWCM7NYWbWdc7NYWZmZj3CnQkzMzNriV8NtSWCE32ZWVctacm6WuGRCTMzM2tJt3UmnJhq0ZL0aUnH5PJnJG1a2XabpM4iXg6QdHq+5TE9w2ivJ2mpjG7ZHXXcVtIdlc8HSHpF0oD8vLWkKS2Uv2G+/WFmZn1Idz7mOAz4aEQ83MXjaompfh0R04GhAJLOAa6KiCsWsj61xFSX1G9QW2KqI2r5JCTtRklM9a9mTxARc4EeSUwVEVdWPn4GmAfc14UiPk+5vi0jYp6kdSkRLpeixJQ4qRuqeRewoaQV8nXREcD9wFbAlPw8ttnCJC0dEXO6oV5mZrYIdcvIhJyYqqXEVFnOQ7m8hqR5kkbk5/GSBkv6sqSfS9oJ2AM4NesyOIvZN9tmZu3YOoOAJyNiXl7vIxHxQrbTSlnWeXn
Ob+UIxgxJh+e6DTMuxPnZNpdlp+wteeOfAgzPVVsDZ1A6EeS/47K8D+Y5p0v6P5XcIkh6TNJ/qcTU+HS23TRJ4ym5SGpttsD30eCazcysB3RLZyIiDgWeoCSmOpXyF/POEbE18D1KYipoS0w1FBhGyalwLPBghkY+pr1zaP7EVEOBESqJqcYDtcRUPyUTU2W5N2e5v6wrrqnEVJROx/l5o6slphoKbJfXW28b4D8oyaneK2kHlSRWvwD2iohtKcGZfljXfnOAh7LTNTLrtlPerNeKiFmVfccA1wBH5bXVtikihgPHUNq83iXAZ7KDd0rlscixwMtZ1hdVIkvuT+kQ7AgcJqkWnXIz4PRsm9eBrzY4zzjKd7MSJRz2aObvTIyVtALwu2yTIZTQ3dVEXa9GxPsi4nLgHOBrEbEj8wfs6vT7kHSIpDsk3TF39osNqmpmZt1hUU3AXAW4XNIM4FRKYiYoiam+I+nbwHoR8VoXyqwmpppCSRa1cW47Hvg4MITSoWjFSEp4aCLibspNqpqY6lvAuyPi9QbHToiIJ/PxRy0x1XtpS0w1lXLzfneDY8cAO+fPiZTHJ9sDtzdZ7z/kv5PzvPOJiEeATYDjctXNknZpUM5OwO8jYnZEvAz8kdImAA9HxIRcvqCyvmospdOwAzAxImYCm0h6B7BM1uO9wN8j4sE85jzKdddcCmWUBlg+ImqPRs6v7NPp9xERZ0XEsIgYNmCFVRpU1czMusOi6kw4MVXXE1ONodzIh1Hmc6xBucGObuJ6queunbfRNbweEddExNHAT4BPNdito7Sj9eFSG4VPHU/pBL0vlwH+CXyWtvkSnaU2fbWTczT7fZiZWQ9YlCMTTkxVNJuYajzwfsojln8D04GvUDoZ9Zpts7eovGkxKJeXoozi/KM2wVFt6b9HU+YqLC9pIKXDUavDeyRtl8v7AbfVnyfnYTxFeURU60xMAI4k50tQ2mSjyjyHLwC3NijrGeB1STvmqv0r17Ow34eZmXWzRdWZcGKqtnM1lZgqH/k8QdsNdwxltOWeBsVeTHlcVJ2A2Zl3AFfno6fplNGhM3Lbb4Fpks6LiIlZ/iRKJ+CMfMsGyojOVyRNA1akdOIaGQsMiIgn8/N4SpuNy2udDXwJ+EO28xvA/7VT1kHA/+YEzFcq6xfq+zAzs+7nRF/WFEkbAlfkhMd+x4m+zMy6Tk70ZWZmZj3BuTmsKRHxABlQrD9ybg6z/sn5MfoHj0yYmZlZS9yZ6KfUj3KhSPqLpI9XPj+oSj4QSX+S9EmVKKIh6cDKtu1y3ZH5+YKs512S7leJVrpOV+trZmbdx52J/uswYI+I2L/TPedXy4VCREyvxb+gvKp7TH7efSHqU8uF0sg4MgqmSijxFyjRNWt2oO0tlul15exLyflRdVREbEV5I2g6cJOkZRaizmZm1g3cmeiH1P9yodSiYpL//hFYJ8vbCHghY0oAPASsrJKjRMAHKTFGFhAR8yLiFOA5SoRUMzPrBZ6A2Q9FxKEZcGvXiHhG0sqUXChzJO1OyYWyF225UC5UyS8ygBLca4vOXvHU/LlQBFyjkgtlnKRaLpS3k7lQ8rHF1yNizwbFTQKGZmCsEZTOwWaSNqaMUNRnEv09JTbHvZRw4m920iRTKKMU882wlHQImfNjwMprdlKEmZktLHcmFg+rAOfmX/kB1Ib8xwPHSXoX8IeI+Hv5Y78p1VwoAAMpuVDGUXKhTKakMP9aZwVFxGuSZlLeBtmeEm59M0rHYkfaHnHUXEqJUno/JYBWZ2HX2wuBfhYZWGvZQRs5oIqZ2SLixxyLh76eCwVKh2EXYLmIeIkSXXNE/sw3MhERj+f53w/c0kTZQymjGGZm1gvcmVg89PVcKFA6DF+jbaTjTkpis3dQUtbX+y/g2xExr70CVRwFrA5c38Q1mZnZIuDOxOKhT+dCSWMpk0bHZ/lvAs9S0pQv8AgiIm6LiD+3U71TM89J7dHJB7I8MzPrBc7NYUs
E5+YwM+s65+YwMzOzHuHOhJmZmbXEr4baEsGJvsza52Ra1iqPTJiZmVlLeqQz4aRUrSelkvRjSbvm8jckLZfLS0t6oYnjB0m6Jutyj6Q/d9YWC1HHb0o6pfL5txkts/b5KEk/a6H8L0v6eav1NDOz7tVTIxNOStViUqqIOC4ibs6P3yADU3XBj4CrI2KriNgM+G6u76gtuuqttktDgNUl1X7PFghQ1Z6MIeGRMzOzfmCR/89aTkoFdJyUStIISZfl8l6SXpW0jKQVJf09118gac8M0rQWMKY6KiPppBx1GC9prQZVGAQ8VqnPtFycry0kLZ8jKNMlTZG0c5b/ZUlXSrpW0kxJ321wjsmUnBvLSlqNEm57BiV0NlRCZ0v6lkqsixmSDs91G+bnMyn5Ngblee+XdAulI1e73n1z37sk3YyZmfWaRT4B00mpFtAoKdUkYNtc3gm4B9iGkg9jQvXgiDhV0jeBnSLihaznKsCtEXFsPkY4mNJJqDoNuEjSFOCGbIsnKW38VltI+jbw74gYImlzSltulGUMB7YA/g1MknRVREyt1O3fkmbktbw96/4oMELSy1nuk5KGA/tneQOAiZJuBWZTOh4H5e/NuyiRMLehRNgcXWmP44FdIuIpSas2amg50ZeZWY/ojWHkVYDL86ZzKrB5rh8PfCdvZutFxGtdKLOalGoKsCElKRWUm87HKUPuP+2soDxvNSnVxKxbLY9Eo6RUnwP2oySl6swCSakyeuMjedMeBvwc2JnSsRjTRJmvRcRfc3kyMLjBOa4BNgB+S7lh3ylp9QZljaQk2SIi7gaeoLQnwLUR8XxEvEoZsRnZ4PjayM4ISrs1arudgN9HxOyIeLmurAcjYlIu7wDcGBHPRsS/KVE4q+c5T9KXaef3OCLOiohhETFswAqrNNrFzMy6QW90JpyUqnFSqjHAxyh/nd9IueGOpPw13pl/V5bn0s6IU96UL4yILwBTadwZ6CitaH241EbhU2vzJnaktNsMymhGte06OserTZwD4CuUjuJg4C5Jb++gTDMzW4R6a2TCSakWNJoysXJcRPwzz7VBRDQ6X7PtUj3/bpKWz+WVgfcAjzQoazTlEQQ5v2QQ8EBu+5DKGzYrAJ+i8WTK2sjEqtl5mUeZxPox2kYmRgOfzvkZA7OsRiMwE4DdJK2Wj772rmxbPyImUNr+eeCdzbeGmZl1p94IWvU/wLmSvgHcVFm/D/AFSW8C/wR+EBHPSRqbj0T+GhHHNCowIq6RtCklKRWUG+TnJX2STEqVcwvGqySlGk8mpQJ+mx2YqgWSUkl6ltLZaZiUqoPrPVXS94Hls7z2klKNp9y4ayMRMyhvszRyFnCDpEeBj3Rw7qrtgNOyfZcCzoiIO2vzU2ptAfwK+F+V5F5vAl/MuRAAtwEXUR6XnF+dL1GT82JeBKZVVk+gzI+YnvtMlHQxZa4IWZfpkjasK+sxST/K458Aqsk1TpX0Hsoox3URMaPJdjAzs27mRF/WlJybsEVEHNnbdVkYTvRlZtZ1cqIvMzMz6wkembAlwrKDNopBBzp4ppn1L72dN8UjE2ZmZtYj3JnopyStrrZcJf+U9Hjlc1SWp0oa3OD4cyTtncu3ZFTLaZLuk3RaNRCUpLlNlLexSu6PB1TysFymEo68q9dVH8fDzMz6OKcg76ci4llKzAoknQC8kuG6kfRKZ1FDG9g/Iu7ItztOBP5EiZ0BJShWu+WpJB27GvhGRPwl1+0KrAk81ZVKRMSIzvcyM7O+xCMTNp+MNPktYF1JWzV52OeB8bWORJZzc0TMkLScpLNVcn3cqbbMp41ysSDplfx3lxwxuSJHSy5Uvp8qaVtJt0qarJIrZFB3toGZmXWNRyYWT8tLqsWAeDgiPt2VgyNibsad2JSSBbWz8raghPFu5D+yzCEZC+Q6lTwnjXKx1NuaEm79CUrsj/dJup0SC+NTEfG0pH2AH1PykczHuTnMzHqGOxOLpw4fSzSpGvK6lfJ
GUm7+RMR9kv5ByZsyHjguk3n9ISL+3uDYiRHxGEB2ZgZTomluAVyfAxUDgCcbnTgizqIE+GLZQRv5tSUzs0XEjzmWEPmoYaqka5rYdwAlMVqjHCKN3E1b1tMFimu0sslcLG9Ulms5RwTcXcnDMiQiPtTgWDMz6yHuTCwhIuKgvPnu0dF+kpahTMB8NCKmdbRvxUWUNONvvRAt6SOShjB/ro+NKaniZ6pxLpZmzATWlLRjrb4qqdLNzKyXuDNhNRdKmkbJCbIiJflWUzJt+8eBwyX9XdI9lCRu/wJ+Tcn9MZ2Srn1URLxBycUyIx9fbAqc1+S5/k1J+PWTnNcxlZJYzMzMeokjYNoSwbk5zMy6zhEwzczMrEe4M2FmZmYt8auhtkSY/viLDD726t6uhplZj+qpRGEemTAzM7OW9FhnwompWiPpk5KOzeU9JW1W2XaLpA4nyEhaStIvJc3I0NaTJL0nt32nm+q4VSVSJpL2kzQ7XzdF0pB8Y2Rhyx8saUZ31NXMzLpPjz3mcGKq1kTEnynxGAD2BK4C7ulCEfsA6wBbRsS8jDz5am77DvDf3VDN6cB6klaKiJcpr2zeRwmLPTE/j222MEkDImJuN9TLzMwWoX7/mGNxSEwlaYCkh1SsKmmepJ1z2xhJG0oalSMwIyiRI0/OumyQxXw263e/pJ0aXPMg4MmImJfX+1hEPC/pJDL3hqQL85zfyBGMGZKOzHWD87rOzeu/QtIKdd/FPGASsH2u2hY4nbY4ECOAcVnebtm+0yX9TtKyuX6WpO9Jui2vaVtJd0kaT+b56Oj7MDOzntdXOhO1m9lUSVd29eD867WWmKqZ8ppKTAXsB5ybIxm1xFRDgWHAYw2O3Ro4EtgMWJ+SmGoZSm6KvSNiW+B3lMRU9fW/P48bmXXbKW+w74qIByr7jqOMUByTES0fzE1LR8TwPP/xDep2GfCJbJOfSto6yzuWHMmJiP0lbQscROkQ7AB8pbYvsAlwVkRsCbwEHNbgPOMo0TBXBOYBtzB/Z2Jstuc5wD7ZzksDX6uU8XpEjIyIS4CzgSMiYse683T6fUg6RNIdku6YO/vFBlU1M7Pu0Fc6E7Wb2dCuZrisWCAx1UKWNxI4H0piKqCamOo7kr4NrJdRH+tNzL/451EiMw6m3IBriammAt8F3tXg2DHAzvlzYtZjO8pf+s34Q/47Oc87n0yYtQnw/yg3+Rsl7dagnJHAlRHxakS8kuXWRjoejYjaY4oLct96YymdhuHApOzsbChpTWBgRDyU9Xg4Iu7PY87N6665FEDSKsCqEXFrrj+/sk+n30dEnBURwyJi2IAVVmlQVTMz6w59pTOxAC15ianGUG7aw4FrgFWBXSi5LZpRO3ftvI2u4Y2I+GtEHEOZI7Fng90aXn+tiE4+A0ygdIJGUm74UEYN9iUfcXRyDmiby6F2ztHs92FmZj2gz3YmlsDEVLdT/qKfFxGvU0Y2vkrpZNR7GVipyfOT591G0jq5vFTW/x+5+c1sRyjXv6ekFfJRxacrdVi3dh2UR0C31Z8nJ14+SsnNUetMjKc8fql1Ju4DBkvaMD8fANxKnYh4AXhRUm0EZP/K9Szs92FmZt2sz3YmmrBYJabKczxK+cseyg18JcobEvUuAY7JCYwbNNjeyFrAX1RerZwGzAFOy21nAdMkXRgRUyjzGSZSOji/iYg7c797gQOz3VcDzmjnXGOBZSPi0fw8njKHZFxe6+uUeRmXZzvPA85sp6yDgNNzAmb1UcZCfR9mZtb9nOjLmqISq+OqiNiil6uyUJzoy8ys6+REX2ZmZtYTnJvDmhIRsyhvpfRLzs1h1vt6Kk+E9TyPTJiZmVlL3JnoY9SWV+TujPz4jXz7YmHKulAlh8mMjDK5TOdHLZx8S+WknNA6I6NTfnQhyvmBpN0XRR3NzGzRcGei76kF3Noc+CCwB40jWjbjQsqbDkOA5YEvd08VG/ohJWT3FjlJ8xN08fVVgIj4XkTc0N2
VMzOzRcediT4sIv4FHAJ8XUV7eUMGSDol10+TdHgef00kyque71LJHjpL82dZfUDS2pLWlPR7lYyikyS9L7cPrJx3mqS9qvVUydHxFeDwfMWViHgqIi7L7fvlsTMk/aRS53PUlsX0qFxfzQ47S9L3JU3JfTbN9SvmSMukbIemXws2M7Pu5wmYfVxEPJSPOdYCvpDrhuSN9boMrHUQ8B5g64iYI2m1ahn5eOMA4D8zY+ifKMGozpa0PTArIp6SdBFwakTcJmld4FrgvcB/AS9mHg0kvb2umhsCj0TES/X1z0BZP6FEHH0+67wnJabGO2uvmlY7N3WeiYhtJB0GHE0ZXTkOuCkiDs7jJkq6ISJerR4o6RBKZ4wBK6/ZbhubmVlrPDLRP9TCT7eXN2R34MyImJPbnqs7/tfA6IioRbK8lBL0CUqY60tzeXfgtAwE9WdgZUkr5frTa4VFxPNdqPt2wC0R8XTW70JKHo6HgPUl/UrSRyiJwxpplHPkQ8CxWc9bgOUokUrn49wcZmY9wyMTfVyGjZ5Lic7ZXk6LdnNYSDoeWJMSmrtmPG3Jt/YEfpTrlwJ2rE+aJand8tMDlFDbK2U47fq6LSDTn28FfJiSqfVzwMENdm2Uc0TAXhExs4M6mZlZD/HIRB+WN/szgdNy3kPDvCHAdcChkpbObavlv1+m3Kz3y0ymAGRZVwI/A+6NiGdz03XA1yvnH9rO+vkec0TEbOC3wC8lvS33GSTpC5SQ3O+XtIZKQrb9gFslrQEsFRG/pzxG2aYLTXMtJRS68lxbd7K/mZktQu5M9D3L114NBW6g3Mi/n9vayxvyG+ARSn6Nu4DP5/5nAmsD47PM71XOcyllDsallXVHAMNykuU9wKG5/kfA23Oy5F3Arg3q/V3gaeAelfwffwSejognKWnPbwbuAqZExJ+AdwK35KOKc3KfZv0QWCavd0Z+NjOzXuLcHLZEcG4OM7Ouk3NzmJmZWU9wZ8LMzMxa4rc5bIngRF+2pHFSLetJHpkwMzOzlrTUmZCTUvVYUipJ4/LfwZI+X1k/StJpTRz/8Qw9fZekeyR9NdfvKWmzbqrjnbXXSSUtLenVfD20tn2ypK68Alpf/i2SOp0IZGZmPavVkQknpeqhpFQRMSIXB9P26mdTsmN2FvCJiNgK2JoSORJK0Kpu6UwA44BaPbeixMAYkXVYEVif8npoM3X2Izgzs36i2x5zOCnVwielkvRrSZ/M5Ssl/S6XvyTpR7n8Su5+ErBTjggdlevWkfS3HGn5nwZfz0qU+THP5vW+EREzJY0APgmcnOVtIGmopAnZdlcqA1TlqMDPJY3Ldhje4DxjaetMjKDEuagFvhpOiTExV9Jqkv6Y55ggacs8xwmSzpJ0HXCepOUlXZL7XUrpZLb7fZiZWe/o1jkTEfFQlrkWJUQymRxqP+BcSctROhy1pFRbUkYk3qK2pFR/y6iNtaRUqJKUCvgFJSnVdsBelMBNUElKleXfVFfNZpJSfYByE9xOJSnVUDIpVV7P2e00wTMRsQ1wBiUpFbQlpdqOEuzp5PwrvWo0sFMuv5O2kYKRwJi6fY8FxuSI0Km5bigl18YQYB9J764ekLk6/gz8Q9LFkvaXtFREjMv1x2R5DwLnAd/OtpvO/CNNK+YIyWHA7xpcf3VkYkRe1xsq+T1GUDobUIJw3Znn+E6es2Zb4FMR8Xnga8Ds3O/Hua12vZ1+H5IOkXSHpDvmzn6x0S5mZtYNFsUETCel6npSqjGU0YbNgHuApyQNAnak3KA7c2NEvBgRr+fx69XvEBFfBnajjPocTYPOgKRVgFUj4tZcdS7l+msuzrJGU9p7vkyfETELeJukd1AeWc0EJgHbUzoTtWup/m7cBKye5wb4cyU3yM7ABbnfNGBarm/q+3CiLzOzntGtz6XlpFQLlZQqIh7Pxwkfofw1v1qe45UGdWzkjcpy9dz155kOTJd0PvAwMKqJsucropPPUL6vvYEnIyIkTQD
eR3nMMSH3adTOtbJebWd924rmvw8zM+sB3TYyISelak+zSanGA0dS2m0MZfSg/hEHwMt0cbJoziPZpbJqKGWkaL7yIuJF4HlJtUcuBwC3Vo7bJ8sbSXmU1OjZwVjgqLye2nV9EfhnRLyQ66q/G7tQHg81Gl2o7rcFUJtb0cr3YWZm3azVzoSTUnWu2aRUY4ClI+IBYApldKJRZ2IaMDTLHDYAAAltSURBVEflFc9mJx4K+JbKq7dTKd/RqNx2CXBMTg7dADiQMq9jGqXT8YNKOc+rvKJ6JvClds41lvLWxniAbNMBzP+45gTyu6NMKD2wnbLOAAbmft+iPKKB1r4PMzPrZk70ZU2RdAtwdET0y2xZTvRlZtZ1cqIvMzMz6wkODGRNiYhdersOZmbWN3lkwszMzFrizoSZmZm1xJ0JMzMza4k7E2ZmZtYSdybMzMysJe5MmJmZWUvcmTAzM7OWOAKmLREkvUzJDdOfrAE809uV6KL+WGfon/V2nXtOf6x3d9V5vYhYs7OdHLTKlhQzmwkJ25dIusN17hn9sd6uc8/pj/Xu6Tr7MYeZmZm1xJ0JMzMza4k7E7akOKu3K7AQXOee0x/r7Tr3nP5Y7x6tsydgmpmZWUs8MmFmZmYtcWfCzMzMWuLOhC02JH1E0kxJD0g6tsH2ZSVdmttvlzS452u5oCbqvbOkKZLmSNq7N+pYr4k6f0PSPZKmSbpR0nq9Uc+6OnVW50MlTZc0VdJtkjbrjXrW66zelf32lhSSev0VxibaepSkp7Otp0r6cm/Us65OnbazpM/l7/Xdki7q6To20kRbn1pp5/slvbBIKhLx/9u79xi5yjqM49+nLYReEJCKEWlcwKVqSWnTraURFayJGsgWY41tIKFGSFSwMRgvaNWi4Q8kBqNAqNRaKsQqVaGKWi9tqRq3tFB6Jca2ttJogq0KlhZalsc/zrt1MjuzcybjnNmZ/X2SSWbOvGf2OWfn8s573jk/xyUubX8BRgN7gQuAU4FtwFvK2nwcuDddnw/8oE1ydwFTgZXAvDbJfAUwLl3/WKv3dc7Mryq53gv8sh32dWp3OrAR6AN6hntmYCFwV6v3b52Zu4GtwFnp9jntkLus/SeA5c3IEiMToVO8Fdhje5/t48AqYG5Zm7nA/en6amCOJBWYsZKauW3vt70deKUVASvIk3m97aPpZh9wXsEZy+XJ/HzJzfHAcJidnud5DfBV4GvAi0WGqyJv5uEkT+YbgLtt/wvA9rMFZ6yk3n29APh+M4JEZyJ0itcDz5TcPpiWVWxj+2XgOeDsQtJVlyf3cFNv5o8Av2hqotpyZZZ0o6S9ZB/MiwrKNpSauSVNBybZ/lmRwYaQ9/nxgXQYbLWkScVEqypP5ouAiyT9QVKfpPcWlq663K/FdKjxfGBdM4JEZyJ0ikojDOXfLPO0KdpwzFRL7sySrgV6gDuamqi2XJlt3237QuCzwOKmp6ptyNySRgF3Ap8qLFFtefb1T4Eu21OB3/C/EcNWyZN5DNmhjsvJvuEvk3Rmk3PVUs/7x3xgte3+ZgSJzkToFAeB0m835wF/q9ZG0hjgDOCfhaSrLk/u4SZXZknvBr4A9Np+qaBs1dS7n1cBVzc1UT61cp8OXAxskLQfuBRY0+JJmDX3te3DJc+J+4AZBWWrJu/7xyO2T9j+C1nhwO6C8lVTz/N6Pk06xAHRmQidYzPQLel8SaeSvXDWlLVZA1yXrs8D1jnNSmqhPLmHm5qZ09D7UrKOxHA4tpwnc+kHw5XAnwvMV82QuW0/Z3ui7S7bXWTzU3ptb2lNXCDfvn5dyc1e4OkC81WS53X4MNnEYiRNJDvssa/QlIPlev+QNBk4C/hjs4JEZyJ0hDQH4iZgLdkb0w9t75L0FUm9qdl3gLMl7QFuBqr+zK4oeXJLminpIPBBYKmkXa1LnHtf3wFMAB5KP0lraQcpZ+ab0k/+niJ7flxX5eEKkzP3sJIz86K0r7eRzU1Z2Jq0mZy
Z1wKHJe0G1gOftn24NYkzdTw/FgCrmvnlKU6nHUIIIYSGxMhECCGEEBoSnYkQQgghNCQ6EyGEEEJoSHQmQgghhNCQ6EyEEEIIoSHRmQghtDVJ/ennpzslPSRpXKszVSNpgqSlkvamn0ZulDSrSX9rhWpUmU3VO88tub1suFRLDe0lOhMhhHZ3zPY02xcDx4GP5l1R0ujmxapoGdlZV7ttTyE7v8LEPCsqM6psWaP5FwInOxO2r7e9u8HHDCNQdCZCCJ3kd8AbIasLIunxNGqxdOCDV9KRdFKfTcBsSV+StDmNbHx7oJKspEWSdqdiVKvSsldLejgt65M0NS1fImm5pA2S9kkaVCRM0oXALGCx7VcAUrXHR9P9N6cMOyV9Mi3rkvS0pHuAJ4FJFfLPkPSYpCckrS07u+TA3x60jWnUogd4MO2jsSl/T1pngaQdaZ3bSx7riKTbJG1L++C1/5f/XGhr0ZkIIXQEZfVW3gfskPRm4EPA22xPA/qBa1LT8cBO27Ns/x64y/bMNLIxFrgqtfscMD0VoxoY7bgV2JqWfR5YWRLhTcB7yMpCf1nSKWURpwBPVSq0JGkG8GGyzsalwA3KTkkOMBlYaXu67QOl+YFNwLeAebZnAMuB2yrsnkHbaHs1sAW4Jo3sHCvJcy5wO/AuYBowU9JArZLxQJ/tS4CNZKW5wwgXnYkQQrsbm06BvQX4K9lp0+eQFY/anO6bA1yQ2vcDPypZ/wpJmyTtIPvwnJKWbyf71n4t8HJadhnwPQDb68hOz35Guu9R2y/ZPgQ8C9Tzjf0y4Ce2X7B9BPgx8PZ03wHbfSVtS/NPJiv09eu0nYvJij2Vq7aN1cwENtj+Rzpl84PAO9J9x4GBcudPAF05tzF0sDGtDhBCCA06lkYfTkqHKu63fUuF9i8OjA5IOg24B+ix/YykJcBpqd2VZB+gvcAXJU1h6JLPpZVR+xn8/roLuETSqIHDHKWRh9i+F6rlT+vtsj272so1trHqakPcd6KkxkOl7QwjUIxMhBA60W+BeZLOgZNzHd5Qod3Ah+ohSRPIqsmSJjpOsr0e+AxwJlnhso2kwyWSLgcO2X4+TyDbe8lGT24tmZfRLWluetyrJY2TNB54P9n8j1r+BLxG0uz0eKekTk/NbUz+Q1bGvNwm4J2SJqa5JguAx/JsZxiZokcZQug4tndLWgz8KnUMTgA3AgfK2v1b0n3ADmA/WUlngNHAA+kQhoA7U9slwHclbQeOUn9l0euBrwN7JB0FDpNVn3xS0grg8dRume2tkrpqbOfxNJHymynrGOAbZKMgtbYRYAVwr6RjwOySdf4u6Ray6pgCfm77kTq3NYwgUTU0hBBCCA2JwxwhhBBCaEh0JkIIIYTQkOhMhBBCCKEh0ZkIIYQQQkOiMxFCCCGEhkRnIoQQQggNic5ECCGEEBryX6qQQSxAZo33AAAAAElFTkSuQmCC\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAg8AAAEWCAYAAADhFHRsAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdd7xcVbn/8c+XgLRQpBpUiHSBQIAQIAYEwYYNBQVEpKiIXOGCgvITr2C74AXFAsLlqvSOYgGUDgkpBBJCEkqQEqkivQWQJM/vj/UMZ2cy55yZ03JO8n2/XueVmV3WXnvNwF6z9trPo4jAzMzMrFlLLOwKmJmZ2cDizoOZmZm1xJ0HMzMza4k7D2ZmZtYSdx7MzMysJe48mJmZWUvceTCzHiXpR5KekfTPhV2X/kDS+yT9XdIrknZf2PUBkHSApFsr71+RtO7CrFMrJIWk9ZvYbidJj/VFnRY37jyYLeYkzZL0Wl5AnpJ0lqTBXSzr3cA3gU0i4h09W9MB6wfAqRExOCL+WL+yrv2fl3RVtmOfybo91NPlSro5L/Rb1C3/Yy7fqaePaX3DnQczA/hERAwGtgK2Ab7bagGSlgTWAZ6NiH91cf9F0TrA3Z1sU2v/IcBTwK96vVZ9537gi7U3klYFtgOeXmg1sm5z58HM3hIRjwN/BTYDkLSSpN9KelLS43lLYlCuO0DSOEmnSHoOuBm4Dlgrf0Wfndt9UtLdkl7IX6LvrR0vf3V/W9I04FVJS+ayoyVNk/RqHn9NSX+V9LKk6yW9vVLGZZL+KelFSWMkbVpZd7ak0/LX/MuSbpO0XmX9ppKuk/Rcjrp8J5cvIekYSQ9KelbSpZJWaa/dJH1F0gNZzp8lrZXLHwTWBf6SbbJ0J+3/OnA5sEml7I9JulPSS5IelXR8Zd0yks7POr4g6XZJa3b22TWo/1u3AZpos40rbTZT0uc6OifgAmCvyrH3Aa4A/l0pc2lJP5f0RP79vNpW+X14MtcdVFf3pSWdLOmR/AzPkLRsO+f57WyLl7Puu3RSd2uHOw9m9pYcLt8NuDMXnQPMAdYHtgQ+BHy5ssu2wEPAGsAHgY8CT+Qw+AGSNgQuAo4AVgeuplxI31YpYx/gY8DKETEnl+2R5W0IfILSofkOsBrl/1uHV/b/K7BB1mEK5WJVtQ/wfeDtwAPAj/NcVwCuB/4GrJXneEPucziwO/D+XPc8cFo7bfYB4ATgc5SRg38AFwNExHrAI+TIQkS80aiMSlnLAXsBEyuLX6X8cl852+lraps7sT+wEvBuYFXgEOC1XNfZZ9eR9tpseUoH8UJKe+8D/LraYWvgCeCePD55LufWbXMsZTRiOLAFMJIc/ZL0EeAoyvdhA2DXun1/QvmeDM9zfSfwvfpKSNoI+DqwTUSsAHwYmNVBva0jEeE///lvMf6j/A/0FeAFyoXv18CywJrAG8CylW33AW7K1wcAj9SVtRPwWOX9fwGXVt4vATwO7FQ59kEN6rNv5f3vgdMr7w8D/tjOuawMBLBSvj8b+E1l/W7AfZVzubOdcu4Fdqm8HwK8CSzZYNvfAv9TeT84tx1aOZ9dm2z/OZSL7bAOtv85cEq+PggYD2xet00zn92tlXUBrN9Em+0FjK071v8Cx7VT15spHZYvUDqRGwH357rHKt+DB4HdKvt9GJiVr38HnFhZt2GtvoAonav1Kuu3Bx6u/z7m9v+idD6WWtj/3Q30v0X1HqOZtWb3iLi+ukDSMGAp4ElJtcVLAI9WNqu+bmQtSocEgIiYJ+lRyq/Djsp4qvL6tQbvB2cdB1F+FX+WMrIxL7dZDXgxX1ef+phd25fya/3Bduq9DnCFpHmVZXMpF+XH67ZdizLiAUBEvCLpWco5zmqn/Hq7R8T1eT6fAm6RtElE/FPStsCJlFtJbwOWBi7L/c7L87hY0srA+ZRf8evQ+WfXkfbabB1gW0kvVNYvmfXoyB+AnwLPtrPtfN+
TfL1WZd3kunU1qwPLAZMr5ylggdszEfGApCOA44FNJV0DfCMinuik7taAb1uYWXsepfx6XS0iVs6/FSOiOkTdWVreJygXHABU/g//bua/AHcnte/nKRfbXSnD90Nrh2pi30eB9TpY99HKea8cEctEmRNSr/4cl6fcQmi0bYciYm5E/IHSURmdiy8E/gy8OyJWAs4gzy8i3oyI70fEJsAo4OOU2wLNfHZd8ShwS127DI6Ir3VyXrMpt5e+RuPOw3xtCKydywCepHxnqutqnqF0Jjet1GelKJNPG9XjwogYnccKyi0P6wJ3HsysoYh4ErgW+KmkFXMS4XqS3t9CMZcCH5O0i6SlKI9xvkEZau8JK2R5z1J+gf53C/teCbxD0hE56W6F/JUP5QL9Y0nrAEhaXdKn2innQuBAScNzkt9/A7dFxKxWT0bFpyhzDe7NxSsAz0XE65JGUjpMte13ljQsRyxeotwumdtDn10jVwIbStpP0lL5t40qk2A78B3g/e20y0XAd7OdV6PMWTg/110KHCBpk5wTclxtp4iYB/wfcIqkNQAkvVPSh+sPIGkjSR/Iz+h1SqdjbrMnbvNz58HMOvJFylD5PZRJg5dT7v83JSJmUu53/4ryK/ETlMmD/+5wx+adSxnGfjzrOLHjzeer28uUSXifoAzT/x3YOVf/gvJr/1pJL2e527ZTzg2UuR2/p/xKXg/Yu8Xz+IukVygdgB8D+0dE7fHOQ4EfZD2+R7mY1ryD8pm8ROls3ELbRbdbn10j2WYfopzfE5R2+wnlVkpn+z4REbe2s/pHwB3ANGA65TbQj3K/v1LmedxImbx5Y92+387lEyW9RJkEu1GDYyxNuf3zTNZ7DUqHxrpAEd0ZMTQzM7PFjUcezMzMrCXuPJiZmVlL3HkwMzOzlrjzYGZmZi1xkChbLKy22moxdOjQhV0NM7MBZfLkyc9ExOr1y915sMXC0KFDueOOOxZ2NczMBhRJ/2i03LctzMzMrCXuPJiZmVlL3HkwMzOzlrjzYGZmZi1x58HMzMxa4s6DmZmZtcSdBzMzM2uJOw9mZmbWEgeJssXC9MdfZOgxVy3sapjZYmzWiR9b2FXoMR55MDMzs5a489DHJJ0i6YjK+2sk/aby/qeSvtGN8o+XdFS+PknSfZKmSbpC0srdq32nxz4qjzdD0l2SvtiFMg7pyn5mZtZ33Hnoe+OBUQCSlgBWAzatrB8FjGumIEmDOtnkOmCziNgcuB/4fy3XtkmSDgE+CIyMiM2AHQG1Wk5EnBER5/Z0/czMrOe489D3xpGdB0qnYQbwsqS3S1oaeC9wp4qT8lf8dEl7AUjaSdJNki4EpueyYyXNlHQ9sFHtQBFxbUTMybcTgXfl9rdJeqvDIulmSVtLWl7S7yTdLulOSZ/K9YMknZz1mCbpsAbn9R3g0Ih4KY/9YkSck/vvkuVNz/KXzuUnSronyzw5l1VHTm6W9BNJkyTdL2mHSn1OynpOk/TV7n0kZmbWCk+Y7GMR8YSkOZLWpnQiJgDvBLYHXgSmRcS/Je0BDAe2oIxO3C5pTBYzkjKi8LCkrYG9gS0pn+cUYHKDQx8EXJKvLwY+BxwnaQiwVkRMlvTfwI0RcVDe4piUHZIvAu8BtoyIOZJWqRYsaQVghYh4sP6gkpYBzgZ2iYj7JZ0LfC3//TSwcUREB7dUloyIkZJ2A44DdgW+BLwYEdtkR2ScpGsj4uG6Yx8MHAwwaMUFMsqamVkXeeRh4aiNPtQ6DxMq78fnNqOBiyJibkQ8BdwCbJPrJlUulDsAV0TE7PzV/+f6g0k6FpgDXJCLLgU+m68/B1yWrz8EHCNpKnAzsAywNuWCfUZtFCMinqs/BBDtnOtGwMMRcX++P4dyS+Ml4HXgN5I+A8xuZ/8/5L+TgaGVen4x63kbsCqwQf2OEXFmRIyIiBGDllupneLNzKxVHnlYOGrzHoZRbls8CnyTckH9XW7T0XyBV+vet3fhRtL+wMcpv/w
DICIel/SspM2BvYDasL+APSJiZl0ZHXUOiIiXJL0qad2IeKi+Cu3sM0fSSGAXysjJ14EPNNj0jfx3Lm3fVwGHRcQ17dXJzMx6j0ceFo5xlAv6czmy8BywMuXWxYTcZgywV97fX53ya31Sg7LGAJ+WtGzePvhEbYWkjwDfBj4ZEfW/7C8GvgWsFBHTc9k1wGHZWUDSlrn8WuAQSUvm8lVY0AnAaZJWzG1WzNsG9wFDJa2f2+0H3CJpcB77auAIyi2aZl1DufWxVB5rQ0nLt7C/mZl1g0ceFo7plHkMF9YtGxwRz+T7Kyidibsov/q/FRH/lLRxtaCImCLpEmAq8A9gbGX1qcDSwHXZH5gYEYfkusuBXwA/rGz/Q+DnwLTsQMyidHJ+A2yYy98E/i/LrjodGEyZm/Em8Cbw04h4XdKBwGXZ+bgdOANYBfhTzokQcGSnrdbmN5RbGFOynk8Du7ewv5mZdYNyJNtskTZixIi44447FnY1zMwGFEmTI2JE/XLftjAzM7OWuPNgZmZmLfGcB1ssODGWmQ0EAyV5lkcezMzMrCUddh7kJE6dldFnSZwkXS1p5fw7tLJ8J0lXNrH/dhmWeqqkeyUdX9l/VCe7N1vHKyTtXnk/U9J3K+9/nwGhulr+2ZL27G49zcysezobeXASpw70ZRKniNgtIl6gxIM4tLPtGzgHODgihgObUaJMAuxEW66N7qp+X1YFXqE8blqzPW0RNDtUiylhZmb9T2edBydx6oMkTpK+JenwfH2KpBsrdTk/X8+StBpwIrBejiCclEUMlnR5jqRckLEP6q0BPJnnOzci7pE0FDgEODLL20HSOpJuyLreoJKDo/ar/wxJY/P8Pt7gGNXvyyjgSmD1/H68B3gtY1UsI+msbOM7Je2cxzhA0mWS/gJcm/udmu1+VZ5Drc0W+DzMzKxvdPjrzkmc+iyJ0xhKeOpfAiOApVWiJ45m/qBPAMdkew7POu+U7bkp8ATlAv4+4Na6/U4BZkq6GfgbcE5EzJJ0BvBKRNQ6RH8Bzo2IcyQdlHWq3YoYCrwfWA+4SdL6EfF65RiTgc0kvY3yfbkFWJfSydyStlGq/wCIiGEqQa+ulbRhrtse2DwinlO5xbERJYz3msA9wO/yM+3085ATY5mZ9YpmJkw6iVPvJ3GaDGydHZs3KG08gtJe9Z2HRiZFxGMRMY8SaXJo/QYR8YMs81rg85QORCPb0xb58jzKZ1tzaUTMi4i/Aw8B9dEu3wDuBrYCtsvzbe/7cl7ucx8lMmat83Bd5TPbkbbv1RPAjbm8qc/DibHMzHpHM52H+iROEykXmOp8h55O4rRvNYkTUE3idHHlmHtExPD8Wzsi7qXjzgHZaXlV0rqNqtDOPnMoIyi/p/wKb+/C21ESp1o93xMR19aV/yYlFPSBlPYeC+xM+YV/b3vn0uC49ceuP48HI+J0SjKqLVTmJXQm2nnd6D2U+u9IGd15nvJ9qXUeeuT70sLnYWZmvaDZkQcncer9JE5jgKPy37GUuQhTa52oipeBFVo4Pnncj1XmQmxA6WS80KC88ZRbSwD7Mv/tj89KWkLSepTbEfNl30zjKFk678r30yijEGtTRiWgnOO+Wa8Nc12jssYAe+f3agilQ0U3Pw8zM+umZma0O4lT3yRxGgscC0yIiFclvU6DWxYR8aykcZJmAH8Fmo18tB9wiqTZlNtC+0bE3JzjcLnKhNPDgMMp8wqOzroeWCljJuWW1JrAIXXzHWrGUzoWJ2R950j6F/Bo3lYB+DVwhqTpWZcDIuINLTjP8wpKmu7plCdwbsnlK9D1z8PMzLrJibGsKZLOBq6MiMsXdl26womxzMxaJyfGMjMzs57gQDzWlIg4YGHXoTuc28LM+puBkseiEY88mJmZWUvceehHtAjmEpG0Rca4qL3fR9LsytMnwyRNy9c3S3qk8lQIkv4o6ZV8PVTSaypRKe9Viea5f2/U28zM2ufOQ/+yKOYSmQ6sk4/mQjm
H+ygRJ2vvq+f0AiVCJtmhGVJX3oMRsWVEvJfySOmR+YSMmZn1EXce+pdFLpdIPp55O7BtLtoaOI35c2BUk2VdTFucic/QFrVzARHxEPANyuOlZmbWR9x56EcyBHN9LpHbKDE0RpC5RCgX1VoukV2BkzKIEpTIi8dGxCaaP5fIZ2gLGV7vIErMCGjLJYIquUQoMShujIhtKMGaTspgVwfTlktkc9rCileNB0bl9vMo4cSrnYfqyMMNwI45crI3bTlO2jOFujDZNZIOlnSHpDvmzn6xk2LMzKxZ7jz0P4taLpHqOY0Ebs+kZOurRCMdnCMINXMpUS33ApaNiFntN1U5hfZWOLeFmVnv8KOa/U99LpFHKRk3XwJ+l9v0dC6RXaq5RCRVc4nUUojXconMrCujw1wiaSKlczOatpDmj1FGFsY32P5iSnTJ4zspF8qoSjP5P8zMrId45KH/WeRyiUTEy5RO0AGVc5hAyUvRqPMwlhLe+qIG694iaShwMvCrjrYzM7Oe5c5D/1PLJTKxbtmLdblEplFyidxI5hKpLygiplDmDEylZKCszyWyAiWXyFRJZ1TWXU4ZFbi0suyHwFKUnCEzaMsz8hvgkVx+FyXddyPjgKUj4tF8P4GSA2OBzkMUJ1fOt2q92qOaWb9fRcRZ7RzTzMx6gXNb2GLBuS3MzFrn3BZmZmbWI9x5MDMzs5b4aQtbLDgxltniaSAnn+rPPPJgZmZmLely58FJnPouiZOktSRdnq+HS9qtsu6tduqkjIMqIaRnVMJLHyBpra7Uq658SXpG0tvz/RBJIWl0ZZunJa3ajWPMkrRad+tqZmbd052RBydx6qMkThHxRETsmW+HA7t1tH09Se+ihJcenW24HeVRTyixF7rdecggU7VQ2lDa6k7aviMbAc9ExLNN1tm31MzM+qnudB6cxKmHkjhJujojOpL1/V6+/qGkL+coxgxJbwN+QAkQNbXWlsAmee4PSWqUJGoN4GXglazLKxHxsKQ9KTkzLsjylpW0S9Zherbh0lmXWZJ+kiMokySt3+A41e/EKOBnzN+ZGJ9lrSPphvwMblDJ5YGksyX9TNJNwE8krSrp2qzP/5KRNfPzvUrSXdkue2FmZn2my50HJ3ECei6J0xhgB0krUvJMvC+Xj6YS2Cnb83vAJRExPCJqx9sY+DClPY9T3mapuAt4CnhY0lmSPpHlXQ7cAewbEcMpYabPBvaKiGGUCbVfq5TzUkSMpASY+nmD83hrNCrr8kfg3fm+2nanAudWPoNfVsrYENg1Ir4JHAfcGhFbUvJyrJ3bfAR4IiK2iIjNgL81qIsTY5mZ9ZLuTph0EqeeSeI0lhJiejRwFTBY0nLA0PpcEu24KiLeyIiM/wLWrK6MiLmUC+6elNs+p0g6vkE5GwEPR8T9+f6crFfNRZV/t2dBk4Ats+O1VES8AjyUoxTV78T2wIX5+rw875rLsr7ksc/Pc7gKeD6XTwd2zZGQHSKiYc/AibHMzHpHdzsP9UmcJlIuDNVfmT2dxGnfahInoJrE6eLKMffIX+fDI2LtiLg3l/dGEqdfMX8o5/a0l8TpdspozQ6UUYg7ga8Ak5soE+CNyuu5NHgEN0M+T4qIEyjnskeDcjr6rGD+tlugHTNHxgOU0aEpuXgiZY7GGkB7HaFqWZ1+J7JzszWlE3FC7TaPmZn1jZ4YeXASp24mccrbEY9SRk8mZplHMX8uipqXKTkpmqbytMZWlUXDgX80KO8+YGhlPsN+lJGimr0q/06gsXGUtqq23X8CE2udPko71uaK7EsZvWlkTK5H0keB2pMcawGzI+J8Sptu1c7+ZmbWC7rbeXASp55L4jQWeCo7R2Mpk0IbdR5uokyQrE6Y7MxSwMkqj7tOpVz8/zPXnQ2ckcsFHAhcJmk6Zc5Hta2XlnRb7ntkO8caR2mrWudhSp5Lte0OBw5Ueex1v0pd6n2fMqdkCuVW1CO5fBgwKet8LPCjjk/fzMx6khNjWVM
kzQJGtNNJ6vecGMvMrHVyYiwzMzPrCQ7EY02JiKELuw7d4dwWZjYQDJRcHB55MDMzs5a48zCASVpT0oUZWXKypAmSPp3rdpJ0ZZPlHC/phLplw3OiZyv1OSonZc7I6I9fbGX/LOOQruxnZmZ9x52HASofQ/0jMCYi1o2IWoTOd3WhuItoewyzZm/aAjk1U59DgA8CIzPq4450HjdiARFxRkSc2+p+ZmbWd9x5GLg+APw7It56lDIi/hERC8SRqGqUuyKjWL4gadvKpp8jg25J+lCOakyRdJmkwQ2K/g5waEYHJSJejIhz2jtmLj9R0j2Z4+LkXFbNpnpzJZ/G/ZJ2yOWDVPKl3J77frVrTWhmZl3hzsPAtSltURybImkZ2s9dcREZuEnSdsCzEfF3lRTY36Xkm9iKkgvjG3XlrgCskKG8mzpmBuj6NLBp5rhoL1bDkplP4whKrguAL1FiiWxDiQb6FUnvaXBs57YwM+sF7jwsIiSdlvMMbu9gs45yV1wM7KmSXn1v2iJmbgdsAozLoEz7A+vUH572w363d8yXgNeB30j6DFAfObSmlql0MjA0X38I+GLW5zZgVWCD+h2d28LMrHf4Uc2B624q+Ski4j9ylKCjSEjtzkGIiEczENT7s9ztK/tcFxH7dLDvS5JelbRuXeKwdo8ZEXMkjQR2oXRWvk65FVOvlrejmrNDwGERcU17dTIzs97jkYeB60ZgGUnVlNnLdbJPZ7krLgJOAR6MiMdy2UTgfbV9JC0nacMGZZ8AnKaSVhxJK0o6uL1j5ryJlSLiasotieGdn/JbrqHc+lgqj7WhSiZPMzPrAx55GKAiIiTtTkmv/S3gaUpGym9XNttF0mOV95+lLXfFkpRsntXcFZcBvwAOqxznaUkHABfVJjpS5kDcz/xOBwYDt0t6E3gT+GlEvC6p0TFXAf6UcyJE+7kyGvkN5RbGlHzq5Glg9xb2NzOzbnBuC1ssOLeFmVnrnNvCzMzMeoQ7D2ZmZtYSz3mwxYITY5lZVwyURFV9zSMPZmZm1pI+6zw4iVP3SLpa0sr5d2hleVNtJ2k7SbdJmirpXknHV/Yf1UN1vCKfAKm9nynpu5X3v8+AUF0t/2xJe3a3nmZm1j190nlwEqfui4jdIuIFYGXg0M62b+Ac4OCIGA5sBlyay3cCeqTzAIyvlSVpVeAV2oJNka/HN1NQPtZpZmb9UF+NPDiJUwdJnCR9S9Lh+foUSTdW6nJ+vp6VESRPBNbLEYSTsojBki7PkZQLsrNWbw3gyTzfuRFxj6ShwCHAkVneDpLWkXRD1vUGSWvn8c+WdIaksXl+H29wjHG0dURGAVcCq6t4D/BaRPxT0jKSzso2vlPSznmMA/Iz+wtwbe53arb7VXkOtTZb4PMwM7O+0VedBydx6jiJ0xhgh3w9gtIZWAoYDYyt2/YYSgTI4RFxdC7bMo+5CbAu8L4GdTsFmJm3Fr4qaZmImEUJ2HRKljcWOBU4N8/zAuCXlTKGUsJXfww4I9urajKwmaS3UToPE4CZwHvz/bjc7j8Aso33Ac6plLU9sH9EfIDS5hsBw4Cv0Daq0dTnISfGMjPrFQtlwqScxKk+idNkYOvs2LxBueiOoHQo6jsPjUyKiMciYh4wtXLst0TED7LMa4HPA39rp6ztabsFdB6lA1NzaUTMi4i/Aw8BG9cd4w1Kzo2tKJ/FbXkuo/KvdstidJZNRNwH/AOohby+LiKey9c7AhflSMkTlJDc0OTn4cRYZma9o686D7ULClCSOFESIq3ewT4dJnECZtGWxOnSyj7X5a/o4RGxSUR8qW7fl4BXJa3b7DEjYg4wEvg9JQxyexfejpI41er0noi4tq78N/N8DqRcYMcCOwPrAc1MBH2j8rp67PrzeDAiTqe0/RY5L6Ez0c7rRu+h1H9HyujO85TcGLXOQ23koaP5Ja92dowWPg8zM+sFfdV5cBKnzpM4jQGOyn/HUuYiTI0F44e/DKzQwvHJ436
sMhdiA0on44UG5Y0nbwkB+wK3VtZ9VtISktaj3B6Z2eBQ44CvAnfl+2mUUYi1KZ1IKOe4b9Zrw1zXqKwxwN45b2QIpUNFNz8PMzPrpj6Z0e4kTk0lcRoLHAtMiIhXJb1Og1sWEfGspHGSZgB/BZqNfLQfpf1nA3OAfSNibk5OvFzSpyhteTjwO0lHZ10PrJQxk9KBWxM4JCJeb3Cc8ZSOxQlZ3zmS/gU8mrdVAH5NmTMxPetyQES80WCe5xWUybbTKZ9hrfO4Al3/PMzMrJucGMuaIuls4MqIuHxh16UrnBjLzKx1cmIsMzMz6wkOxGNNiYgDFnYdusO5LcystyyO+S888mBmZmYtcedhEaV+kktE0hYZ46L2fh9JsytPnwyTNC1f3yzpkcpTIUj6o6RX8vVQSa9lVMp7VaJ57t9MPczMrOe487AIyotvf8klMh1YJwNgQYn3cB8lKmbt/bjK9i+QETIlrQwMqSvvwYjYMiLem/U4Mp+QMTOzPuLOw6Kp3+QSycczbwdq+28NnMb8OTCqybIupi3OxGdoi9q5gIh4iBJ+/PCOzsvMzHqWOw+Lpn6TSySNB0ZlcKx5wM3M33mojjzcAOwoaVAe85JOqj6FujDZlXNybgszs17gzsNiYCHnEoG2bJsjgdszKdn6klYHBucIQs1cSlTLvYBlM3lXh6fX3grntjAz6x1+VHPRdDcl5wdQconkKEFHUZI6zCUiaRZtuUS2r+xzXUTs00l9JlIyio6mJMoCeIzSERnfYPuLKdElj++kXChzJ5qavGlmZj3DIw+Lpn6VSyQiXgYeBQ6grfMwgZKXolHnYSwlvPVFDda9RdJQ4GSgw7kcZmbWs9x5WARlMq3dgfdLeljSJMptiAVyidT+KL/ga3k9plPmJtTnEtmUnCiZx3ma0iG4KB+3nEg78w8oty6WzoyoUDoP69Kg8xDFyRHxTINy1qs9qknJpvqriDiro/YwM7Oe5dwWtlhwbgszs9Y5t4WZmZn1CHcezMzMrCV+2sIWC06MZda7FsfkUIszjzyYmZlZS/pF58FJnDqt11qSLq+cz25153xUE2UclGGnp0maIelTufwASWt1pV515UvSM5Lenu+HSHcQ6GwAACAASURBVApJoyvbPC1p1W4cY1bGqzAzs4VooXcenMSpcxHxRETsmW+HA7t1tH09Se8CjgVGR8TmlMiQ03L1AUC3Ow/5eOhttAWQGgXcmf8iaSPgmYh4tsk6+5aamVk/tdA7DziJE5KulrR5vr5T0vfy9Q8lfTlHMWZIehvwA2AvSVMl1TpKm+RIyEOSGiWJWgN4GXgl6/JKRDwsaU9gBHBBlrdso3bNusyS9JMcQZlUCSZVVQtDXWurnzF/Z2J8lrWOpBtyFOQGSWvn8rMl/UzSTcBPJK0q6dqsz/+SUTAlLS/pKpWQ2zMq7WBmZn2gP3QenMQJxgA7SFoRmEOOZlDCOY+tbRQR/wa+B1wSEcMjona8jYEPU3JHHFe7zVJxF/AU8LCksyR9Isu7nNIO+0bEcCBov10BXoqIkcCpwM8bnMd42tpqJGVE6d35vtp2pwLn5ijIBcAvK2VsSPmMvgkcB9waEVsCfwbWzm0+AjwREVtExGbA3xrUxYmxzMx6SX/oPMxHi2cSp7FZ/9HAVcBgScsBQ3M0pTNXRcQbGZHxX8Ca1ZURMZdywd0TuB84RdLxDcrpqF2hrS0vom1EoWoSsGV2vJaKiFeAh3KUojpqsz1tt5LOy/OuuSzrSx77/DyHq4Dnc/l0YNccCdkhIhr2DJwYy8ysd/SH+8pO4lRulYwAHgKuA1YDvgJMbqJMgDcqr+fS4HPNOQmTgEmSrgPOalDndtu1Vkw7r2vHmC3pAeAg2kaTJlLmaKwBtNcRqpb1agfrase5X9LWWe4Jkq6NiB90UnczM+sh/WHkYbFP4pS3Ix6lzM+YmGUeReWWRcXLwAoNlnd07LUkbVVZNBz4R4PyOmvXvSr/TqCxcZS
2qrbdfwIToy0W+nja5orsSxm9aWRMrkfSR4HakxxrAbMj4nxKm27Vzv5mZtYLFnrnwUmc3jIWeCoiZufrd9G483ATZYJkdcJkZ5YCTpZ0X96y2YtyQYcyx+GMXC46btelJd2W+x7ZzrHGUdqq1nmYkudSbbvDgQPzc9ivUpd636fMKZkCfAh4JJcPo4ygTKU8RfKjjk/fzMx6khNjWVPyVtCIdjpJ/Z4TY5mZtU5OjGVmZmY9oT9MmLQBICKGLuw6dIdzW5jZQDEQ8oR45MHMzMxa4s5DPyXpcJWcGBe0uN/Kkg7N18NyYuVUSc/lhNSpkq7vQn3WlbR3B+s3lvRXSX/Pel8saY0WjzFIUqNJomZm1o+489B/HQrsFhH7trjfyrkvETE9I1EOp0RoPDrf79qF+qxL2+OV85G0LHAl5WmSDTKXx/8BLSXBioi5EbFDF+pmZmZ9yJ2HfkjSGZSL9Z8lHSlppKTx+djneJUkU0jaNPNMTM08ERsAJ1IeEZ0q6aROjnNM7j9Nbfk0ts993yZpsKR7JL03y90519Xnz9iPktjs6tqCiLghIu5VyZdxjkqujCmSdszjDJN0e6Xu60paUtILuX5XlbwXf5A0U9K5lXpvI+kWlQysf5W0JmZm1mc8YbIfiohDJH0E2DkinlHJebFjRMyRtCvw35TomYcAv4iIC1SSZg0CjgE2y9GGdqmk9V6bkgRMwNWSRkXEeEl/oyTgejtwVnYCjgG+HhG7NyhuM9qPhnk4JfHZMEmb5nE2oIyOnBwRl6gk32oU3XIrSkjxfwETVXKV3An8Avhkts2+wA+Bgxuc48G15YNWXL2j5jAzsxa48zAwrASckxfdoAR9ghKI6ViVlNt/yARgzZb5IeCjlIsxwGBKUqrxlIRUk4GXmD8xVleMBk4CiIi7JT0BrJ/H+a6kdbLuD2jBNNwTI+JJgAwINRR4nRIA7Po810GU8OELiIgzgTMBlh6ygQOamJn1EN+2GBh+CNyUGSQ/ASwDEBEXAp8EXgOukfSBFsoU8KPanIiIWD8izs51q1FChK8ILN1EWXdT0pe3d5wFRMR5wKcpeTmuq93OqNMoZ4eAaZV6D4uIjzZRRzMz6yHuPAwMKwGP5+sDagslrQs8FBG/pEyI3Jzmc19cA3xJJQMmkt6lkpAMyq/1Yyhhvk/IZR2Vex4lvPhHKnXbTdImzJ+f4r3AEOABSetGxAMR8QtKJtHNm6gzwD3AOyWNzDLflrdDzMysj7jzMDD8DyV75DjKMH3NXsCMHNLfGDg3Ip6lpB2f0dGEyZzceDllLsF0Su6NwZIOAl6NiEuBH1OSib2fcntjkEq69MPryppNGRE5Mh/VvAf4AvA0JRHYsnmMC4AvZiKwz0u6O+u+Lpl6uzMR8QYltfjPJN2V9dq2mX3NzKxnOLeFLRac28LMrHXObWFmZmY9wp0HMzMza4kf1bTFghNjmdlA1F+TZHnkwczMzFrSY50HJ3LqXZI+LenofP0ZSRtX1t0qqbOIkoMknZZPYUzPsNTrSFoio0f2RB23lnRH5f1+kl6RNCjfbylpSjfKXz+fzjAzs4WoJ29bHAp8NCIebnG/WiKnX0fEdGA4gKSzgSsj4vIu1qeWyOni+hVqS+R0eC0fg6RdKImc/tXsASJiLtAniZwi4orK288A84D7Wiji85Tz2zwi5klamxJBcglKTIcTe6CadwHrS1ouH98cBdwPbAFMyffjmi1M0pIRMacH6mVmZj2oR0Ye5ERO3UrklOU8lK9XkzRP0qh8P0HSUElflvRzSTsAuwGnZF2GZjF7Z9vMrO1bZwjwZETMy/N9JCJeyHZaIcs6N4/5rRyhmCHpsFy2fsZlOC/b5tLshL0lL/RTgJG5aEvgdEqngfx3fJb3wTzmdEn/p5KbA0mPSfovlZgWn862myZpAiWXR63NFvg8GpyzmZn1gh7pPETEIcATlEROp1B+Ee8YEVsC36MkcoK2RE7DgRGUnATHAA9
mqOGj2zuG5k/kNBwYpZLIaQJQS+T0UzKRU5Z7U5b7y7rimkrkROlknJcXtloip+HANnm+9bYC/oOSzOm9krZTSfr0C2CPiNiaEgzph3XtNwd4KDtZo7NuO+TFeY2ImFXZdixwNXBknlttnSJiJHA0pc3rXQx8Jjt0J1ducxwDvJxlfVElcuO+lA7A9sChkmrRHzcBTsu2eR34aoPjjKd8NitQwkuPYf7OwzhJywG/yzYZRgmFXU1s9WpEvC8iLgPOBr4WEdszf4CsTj8PSQdLukPSHXNnv9igqmZm1hW9NWFyJeAySTOAUyiJjKAkcvqOpG8D60TEay2UWU3kNIWSXGnDXHcc8HFgGKUD0R2jKeGWiYi7KRelaiKnbwHvjojXG+w7MSKezNsZtURO76UtkdNUysX63Q32HQvsmH8nUG6HbAvc1mS9/5D/Ts7jziciHgE2Ao7NRTdJ2qlBOTsAv4+I2RHxMvBHSpsAPBwRE/P1+ZXlVeMonYTtgEkRMRPYSNI7gKWyHu8F/h4RD+Y+51LOu+YSKKMwwLIRUbvVcV5lm04/j4g4MyJGRMSIQcut1KCqZmbWFb3VeXAip9YTOY2lXLhHUOZjrEa5oI5p4nyqx64dt9E5vB4RV0fEUcBPgE812KyjtJz14UgbhSedQOn0vC9fA/wT+Cxt8x06S/35aifHaPbzMDOzXtCbIw9O5FQ0m8hpAvB+yi2TfwPTga9QOhX1mm2zt6g8CTEkXy9BGaX5R21CotrSYY+hzDVYVtJgSgejVof3SNomX+8D3Fp/nJxH8RTllk+t8zAROIKc70Bpkw0q8xS+ANzSoKxngNclbZ+L9q2cT1c/DzMz66be6jw4kVPbsZpK5JS3cJ6g7QI7ljKack+DYi+i3P6pTpjszDuAq/JW0nTK6M/pue63wDRJ50bEpCz/dspF//R8CgbKiM1XJE0Dlqd02hoZBwyKiCfz/QRKm43Pc50NfAn4Q7bzG8D/tVPWgcD/5oTJVyrLu/R5mJlZ9zkxljVF0vrA5TlBccBxYiwzs9bJibHMzMysJzi3hTUlIh4gA3gNRM5tYWZ9rb/mpegJHnkwMzOzlrjzMEBpAOUSkfQXSR+vvH9QlXwakv4k6ZMqUTpD0v6VddvksiPy/flZz7sk3a8SDXStVutrZmZd587DwHUosFtE7NvplvOr5RIhIqbX4k9QHp09Ot/v2oX61HKJNDKejDKpEpr7BUr0yprtaHvKZHpdOXtTcmZUHRkRW1Ce2JkO3ChpqS7U2czMusCdhwFIAy+XSC3qJPnvH4G1srwNgBcypgPAQ8CKKjk+BHyQEuNjARExLyJOBp6jRCA1M7M+4AmTA1BEHJIBrnaOiGckrUjJJTJH0q6UXCJ70JZL5AKV/ByDKMG0NuvskUvNn0tEwNUquUTGS6rlEnk7mUskb0N8PSJ2b1Dc7cDwDEQ1itIZ2ETShpQRiPpMm7+nxMa4lxKe+81OmmQKZRRivhmRkg4mc2YMWnH1ToowM7NmufOwaFgJOCd/xQdQG8KfABwr6V3AHyLi7+XHfFOquUQABlNyiYyn5BKZTEnp/bXOCoqI1yTNpDytsS0lfPkmlI7E9rTdsqi5hBIF9H5KwKrOwpi3F1L8TDKQ1dJDNnBAEzOzHuLbFouG/p5LBEoHYSdgmYh4iRK9clT+zTfyEBGP5/HfD9zcRNnDKaMUZmbWB9x5WDT091wiUDoIX6NtJONOSiKwd1BSuNf7L+DbETGvvQJVHAmsClzXxDmZmVkPcOdh0dCvc4mkcZRJnhOy/DeBZylpuxe4pRARt0bEn9up3imZJ6R2K+QDWZ6ZmfUB57awxYJzW5iZtc65LczMzKxHuPNgZmZmLfGjmrZYcGIss/5rUU4gtajyyIOZmZm1pE86D07i1P0kTpJ+LGnnfP0NScvk6yUlvdDE/kMkXZ11uUfSnztriy7U8ZuSTq68/21Go6y9P1LSz7pR/pcl/by79TQzs+7pq5E
HJ3HqZhKniDg2Im7Kt98gA0G14EfAVRGxRURsAnw3l3fUFq16q+3SMGBVSbXv2QIBodqTMRw8MmZm1g/1+v+c5SROQMdJnCSNknRpvt5D0quSlpK0vKS/5/LzJe2eQZHWAMZWR10knZijChMkrdGgCkOAxyr1mZYv52sLScvmCMl0SVMk7Zjlf1nSFZKukTRT0ncbHGMyJWfF0pJWoYSvnkEJRQ2VUNSSvqUSa2KGpMNy2fr5/gxKvoohedz7Jd1M6bjVznfv3PYuSTdhZmZ9ptcnTDqJ0wIaJXG6Hdg6X+8A3ANsRcknMbG6c0ScIumbwA4R8ULWcyXglog4Jm8LHETpFFSdClwoaQpwfbbFk5Q2fqstJH0b+HdEDJO0KaUtN8gyRgKbAf8Gbpd0ZURMrdTt35Jm5Lm8Pev+KDBK0stZ7pOSRgL7ZnmDgEmSbgFmUzoaB+b35l2USJNbUSJYjqm0x3HAThHxlKSVGzW0nBjLzKxXLIxh4ZWAy/IicwqwaS6fAHwnL17rRMRrLZRZTeI0BVifksQJykXm45Qh9J92VlAet5rEaVLWrZaHoVESp88B+1CSOHVmgSROGR3xkbxIjwB+DuxI6UiMbaLM1yLir/l6MjC0wTGuBtYDfku5QN8padUGZY2mJKUiIu4GnqC0J8A1EfF8RLxKGZEZ3WD/2sjNKEq7NWq7HYDfR8TsiHi5rqwHI+L2fL0dcENEPBsR/6ZEuawe51xJX6ad73FEnBkRIyJixKDlVmq0iZmZdcHC6Dw4iVPjJE5jgY9Rfn3fQLnAjqb82u7Mvyuv59LOiFJehC+IiC8AU2l88e8o7WZ9ONJG4Ulr8x62p7TbDMpoRbXtOjrGq00cA+ArlI7hUOAuSW/voEwzM+tBC2vkwUmcFjSGMhFyfET8M4+1XkQ0Ol6z7VI9/i6Sls3XKwLvAR5pUNYYyi0Fcn7IEOCBXPchlSdglgM+RePJj7WRh5WzszKPMun0Y7SNPIwBPp3zKwZnWY1GWCYCu0haJW9l7VlZt25ETKS0/fPAO5tvDTMz646FESTqf4BzJH0DuLGyfC/gC5LeBP4J/CAinpM0Lm9x/DUijm5UYERcLWljShInKBfEz0v6JJnEKecGTFBJ4jSBTOIE/DY7LFULJHGS9Cylc9MwiVMH53uKpO8Dy2Z57SVxmkC5UNdGGmZQnjZp5EzgekmPAh/p4NhV2wCnZvsuAZweEXfW5pfU2gL4FfC/Ksmw3gS+mHMZAG4FLqTc/jivOt+hJue1vAhMqyyeSJnfMD23mSTpIspcD7Iu0yWtX1fWY5J+lPs/AVSTU5wi6T2UUYxrI2JGk+1gZmbd5MRY1pScW7BZRByxsOvSFU6MZWbWOjkxlpmZmfUEjzzYYmHpIRvEkP0dnNLMBo7+kPPDIw9mZmbWI9x5GKAkraq2XB//lPR45X1UXk+VNLTB/mdL2jNf35xRI6dJuk/SqdXAS5LmNlHehiq5Mx5QyWNyqUp471bPqz6OhpmZ9TNOyT1ARcSzlJgRSDoeeCXDXyPplc6icjawb0TckU9fnAD8iRK7AkoQqnbLU0nSdRXwjYj4Sy7bGVgdeKqVSkTEqM63MjOzhckjDzafjOT4LWBtSVs0udvngQm1jkOWc1NEzJC0jKSzVHJl3Km2zKCNcpkg6ZX8d6ccEbk8R0MuUD4vKmlrSbdImqySa2NIT7aBmZl1zCMPi6ZlJdViMDwcEZ9uZeeImJtxHzamZAntrLzNKGGxG/mPLHNYxuK4ViVPSKNcJvW2pIQvf4ISe+N9km6jxKL4VEQ8LWkv4MeUfB7zcW4LM7Pe4c7DoqnD2wxNqoaQ7k55oykXeyLiPkn/oOQdmQAcm8mv/hARf2+w76SIeAwgOy9DKdEqNwOuy4GIQcCTjQ4cEWdSAmqx9JAN/FiRmVkP8W2LxUTeOpgq6eomth1ESSTWKAdHI3fTlhV0geIaLWwyl8kblde1nB0C7q7kMRkWER9qsK+
ZmfUSdx4WExFxYF5sd+toO0lLUSZMPhoR0zratuJCStrttx5KlvQRScOYP1fGhpTU6TPVOJdJM2YCq0vavlZfldThZmbWR9x5sJoLJE2j5NRYnpKsqimZxvzjwGGS/i7pHkrSs38Bv6bkzphOSV9+QES8QcllMiNvR2wMnNvksf5NSZD1k5yXMZWSiMvMzPqII0zaYsG5LczMWucIk2ZmZtYj3HkwMzOzlvhRTVssTH/8RYYec9XCroaZWZ/qreRaHnkwMzOzlvRZ58GJnLpH0iclHZOvd5e0SWXdzZIWmNBSt/8Skn4paUaGir5d0nty3Xd6qI5bVCJRImkfSbPz8U8kDcsnOrpa/lBJM3qirmZm1nV9dtvCiZy6JyL+TImHALA7cCVwTwtF7AWsBWweEfMysuOrue47wH/3QDWnA+tIWiEiXqY8QnkfJcz0pHw/rtnCJA2KiLk9UC8zM+tBA/62xaKQyEnSIEkPqVhZ0jxJO+a6sZLWl3RAjrCMokRmPCnrsl4W89ms3/2SdmhwzkOAJyNiXp7vYxHxvKQTydwVki7IY34jRyhmSDoilw3N8zonz/9yScvVfRbzgNuBbXPR1sBptMVhGAWMz/J2yfadLul3kpbO5bMkfU/SrXlOW0u6S9IEMk9GR5+HmZn1vv7SeahdvKZKuqLVnfPXaS2RUzPlNZXICdgHOCdHKmqJnIYDI4DHGuy7JXAEsAmwLiWR01KU3A57RsTWwO8oiZzq639/7jc667ZDXlDfFREPVLYdTxmBODojRj6Yq5aMiJF5/OMa1O1S4BPZJj+VtGWWdww5UhMR+0raGjiQ0gHYDvhKbVtgI+DMiNgceAk4tMFxxlOiTS4PzANuZv7Ow7hsz7OBvbKdlwS+Vinj9YgYHREXA2cBh0fE9nXH6fTzkHSwpDsk3TF39osNqmpmZl3RXzoPtYvX8FYzQFYskMipi+WNBs6DksgJqCZy+o6kbwPrZFTFepPyF/08SuTDoZQLbi2R01Tgu8C7Guw7Ftgx/07IemxD+SXfjD/kv5PzuPPJBFMbAf+PclG/QdIuDcoZDVwREa9GxCtZbm0k49GIqN12OD+3rTeO0kkYCdyenZv1Ja0ODI6Ih7IeD0fE/bnPOXneNZcASFoJWDkibsnl51W26fTziIgzI2JERIwYtNxKDapqZmZd0V86DwvQ4pfIaSzlIj0SuBpYGdiJkhuiGbVj147b6BzeiIi/RsTRlDkOuzfYrOH514ro5D3AREqnZzTlAg9lVGBv8pZFJ8eAtrkYaucYzX4eZmbWC/pt52ExTOR0G+UX+7yIeJ0ycvFVSqei3svACk0enzzuVpLWytdLZP3/kavfzHaEcv67S1oubz18ulKHtWvnQbmlc2v9cXKi5KOU3Ba1zsMEyu2UWufhPmCopPXz/X7ALdSJiBeAFyXVRjj2rZxPVz8PMzPrpn7beWjCIpXIKY/xKOWXO5QL9gqUJxjqXQwcnRMO12uwvpE1gL+oPOo4DZgDnJrrzgSmSbogIqZQ5iNMonRofhMRd+Z29wL7Z7uvApzezrHGAUtHxKP5fgJlDsj4PNfXKfMqLst2ngec0U5ZBwKn5YTJ6q2JLn0eZmbWfU6MZU1RiZVxZURstpCr0iVOjGVm1jo5MZaZmZn1BOe2sKZExCzKUyMDknNbmA0cvZWPwXqORx7MzMysJe489DNqy8txd0ZW/EY+HdGVsi5QyQEyI6M4LtX5Xl2TT5GcmBNQZ2T0x492oZwfSNq1N+poZmY9w52H/qcW4GpT4IPAbjSOGNmMCyhPIgwDlgW+3DNVbOiHlBDYm+Wkyk/Q4uOkABHxvYi4vqcrZ2ZmPcedh34sIv4FHAx8XUV7eTcGSTo5l0+TdFjuf3UkyqOX71LJrjlL82chfUDSmpJWl/R7lYybt0t6X64fXDnuNEl7VOupkuPiK8Bh+cgpEfFURFya6/fJfWdI+kmlzmerLcvnkbm8mj11lqTvS5qS22y
cy5fPkZTbsx2afkzXzMy6zxMm+7mIeChvW6wBfCGXDcsL6bUZyOpA4D3AlhExR9Iq1TLydsV+wH9mRs0/UYI/nSVpW2BWRDwl6ULglIi4VdLawDXAe4H/Al7MPBRIentdNdcHHomIl+rrn4GpfkKJ6Pl81nl3SkyLd9Ye/ax2Zuo8ExFbSToUOIoyenIscGNEHJT7TZJ0fUS8Wt1R0sGUzheDVly93TY2M7PWeORhYKiFc24v78auwBkRMSfXPVe3/6+BMRFRixR5CSXIEpSw0Zfk612BUzPw0p+BFSWtkMtPqxUWEc+3UPdtgJsj4ums3wWUPBYPAetK+pWkj1ASbTXSKGfHh4Bjsp43A8tQIoHOx7ktzMx6h0ce+rkMwzyXEv2yvZwQ7eaAkHQcsDol1HXNBNqSVe0O/CiXLwFsX59kSlK75acHKKGrV8jw1PV1W0CmA98C+DAlk+nngIMabNooZ4eAPSJiZgd1MjOzXuKRh34sL+5nAKfmvIWGeTeAa4FDJC2Z61bJf79MuTjvk5k+AciyrgB+BtwbEc/mqmuBr1eOP7yd5fPdtoiI2cBvgV9KeltuM0TSFyghrt8vaTWVBGb7ALdIWg1YIiJ+T7ktslULTXMNJbS48lhbdrK9mZn1IHce+p9la49qAtdTLtzfz3Xt5d34DfAIJT/FXcDnc/szgDWBCVnm9yrHuYQyh+KSyrLDgRE5KfIe4JBc/iPg7Tm58S5g5wb1/i7wNHCPSv6MPwJPR8STlDTgNwF3AVMi4k/AO4Gb89bD2blNs34ILJXnOyPfm5lZH3FuC1ssOLeFmVnr5NwWZmZm1hPceTAzM7OW+GkLWyw4MZZZ85yYyjrjkQczMzNrSbc6D3ISpz5L4iRpfP47VNLnK8sPkHRqE/t/PEM53yXpHklfzeW7S9qkh+p4Z+3xTklLSno1H9esrZ8sqZVHMuvLv1nSAhN3zMysb3V35MFJnPooiVNEjMqXQ2l7FLMp2RE7E/hERGwBbEmJzAglSFSPdB6A8UCtnltQYlCMyjosD6xLeVyzmTr7lpqZWT/VY7ctnMSp60mcJP1a0ifz9RWSfpevvyTpR/n6ldz8RGCHHPE5MpetJelvOZLyPw0+nhUo81uezfN9IyJmShoFfBI4KctbT9JwSROz7a5QBoTKX/0/lzQ+22Fkg+OMo63zMIoSZ6IWaGokJcbDXEmrSPpjHmOipM3zGMdLOlPStcC5kpaVdHFudwmlU9nu52FmZn2jR+c8RMRDWeYalJDDZDKlfYBzJC1D6WDUkjhtThlxeIvakjj9LaMi1pI4oUoSJ+AXlCRO2wB7UAIlQSWJU5Z/Y101m0ni9AHKRW8blSROw8kkTnk+Z7XTBM9ExFbA6ZQkTtCWxGkbSnClk/JXeNUYYId8/U7aRgJGA2Prtj0GGJsjPqfksuGUXBXDgL0kvbu6Q+a6+DPwD0kXSdpX0hIRMT6XH53lPQicC3w72246848kLZ8jIIcCv2tw/tWRh1F5Xm+o5McYRelcQAl6dWce4zt5zJqtgU9FxOeBrwGzc7sf57ra+Xb6eUg6WNIdku6YO/vFRpuYmVkX9MaESSdxaj2J01jKaMImwD3AU5KGANtTLsiduSEiXoyI13P/deo3iIgvA7tQRnWOosHFX9JKwMoRcUsuOody/jUXZVljKO09XybMiJgFvE3SOyi3oGYCtwPbUjoPtXOpfjduBFbNYwP8uZJbY0fg/NxuGjAtlzf1eTgxlplZ7+jR+8pyEqcuJXGKiMfz9sBHKL/WV8ljvNKgjo28UXldPXb9caYD0yWdBzwMHNBE2fMV0cl7KJ/XnsCTERGSJgLvo9y2mJjbNGrnWlmvtrO8bUHzn4eZmfWCHht5kJM4tafZJE4TgCMo7TaWMjpQf8sC4GVanNyZ80B2qiwaThkJmq+8iHgReF5S7RbKfsAtlf32yvJGU24NNboXMA44Ms+ndl5fBP4ZES/ksup3YyfK7Z5Gowf
V7TYDanMjuvN5mJlZN3W38+AkTp1rNonTWGDJiHgAmEIZfWjUeZgGzFF5KKDjQAAACWlJREFU5LLZiYICvqXyKOxUymd0QK67GDg6J3OuB+xPmZcxjdLJ+EGlnOdVHhk9A/hSO8caR3mqYgJAtukg5r/9cjz52VEmgO7fTlmnA4Nzu29RbrlA9z4PMzPrJifGsqZIuhk4KiIGZHYpJ8YyM2udnBjLzMzMeoID8VhTImKnhV0HMzPrHzzyYGZmZi1x58HMzMxa4s6DmZmZtcSdBzMzM2uJOw9mZmbWEncezMzMrCXuPJiZmVlLHGHSFguSXqbkVhkoVgOeWdiVaJHr3Ddc59430OoLvVfndSJi9fqFDhJli4uZjUKs9leS7hhI9QXXua+4zr1voNUX+r7Ovm1hZmZmLXHnwczMzFrizoMtLs5c2BVo0UCrL7jOfcV17n0Drb7Qx3X2hEkzMzNriUcezMzMrCXuPJiZmVlL3HmwRYakj0iaKekBScc0WL+0pEty/W2ShvZ9LReoU2d13lHSFElzJO25MOpYr4k6f0PSPZKmSbpB0joLo551deqszodImi5pqqRbJW2yMOpZV6cO61zZbk9JIWmhPlrYRBsfIOnpbOOpkr68MOpZV6dO21jS5/L7fLekC/u6jg3q01k7n1Jp4//f3v3HyFHWcRx/f9pC6A8EpGJEGg/wqFpS2vRqaUQFa6IGcsVYYxtIqBESFWwMxh9o1aLhDyQGo0Co1FoqxCpVoYpaf7SlarzSQulPora1lUYTbBWwtNByfPxjnqvrdvd2xrqze9vvK7lkd/aZvc/M7e0+88yz8/2jpGeaEsR2/MTPkP8BhgM7gfOAk4FNwJuq2nwUuDvdng18bwhk7gImAkuBWUNkP18GjEq3PzJE9vMrKm73Aj9v98yp3anAWqAP6GnnvMBc4I5W7tf/IXM3sBE4I90/q90zV7X/GLC4GVli5CF0ijcDO2zvsn0YWAbMrGozE7g33V4OzJCkEjNWa5jZ9m7bm4GXWxGwhjyZV9s+mO72AeeUnLFanszPVdwdDbR6Jnme1zPAl4GvAC+UGa6GvHnbSZ7M1wF32v4ngO2nS85Yreh+ngN8txlBovMQOsVrgacq7u9Ny2q2sf0S8CxwZinpasuTud0Uzfwh4GdNTdRYrsySrpe0k+zDeF5J2eppmFnSZGCc7Z+UGayOvK+L96XTWcsljSsnWl15Ml8AXCDpd5L6JL27tHS15f7/S6cLzwVWNSNIdB5Cp6g1glB99JinTZnaLU8euTNLuhroAW5raqLGcmW2faft84FPA/Obnmpwg2aWNAy4HfhEaYkGl2cf/xjosj0R+BX/GQVslTyZR5CduriU7Ch+kaTTm5xrMEXeM2YDy233NyNIdB5Cp9gLVB7JnAP8tV4bSSOA04B/lJKutjyZ202uzJLeCXwO6LX9YknZ6im6n5cBVzY1UWONMp8KXAiskbQbuBhY0cJJkw33se39Fa+Fe4ApJWWrJ+97xkO2j9j+M1lxve6S8tVS5LU8myadsoDoPITOsR7olnSupJPJ/nFWVLVZAVyTbs8CVjnNKmqRPJnbTcPMaTh9IVnHodXniCFf5soPhMuBP5WYr5ZBM9t+1vZY2122u8jmlvTa3tCauLn28Wsq7vYCT5aYr5Y8/38Pkk0ARtJYstMYu0pN+d9yvWdIGg+cAfy+WUGi8xA6QprDcAOwkuxN6fu2t0n6kqTe1OxbwJmSdgA3AnW//laGPJklTZW0F3g/sFDSttYlzr2fbwPGAA+kr4u1tEOUM/MN6at4T5C9Nq6p83SlyJm5beTMOy/t401kc0rmtiZtJmfmlcB+SduB1cAnbe9vTeJCr4s5wLJmHhzF5alDCCGEUEiMPIQQQgihkOg8hBBCCKGQ6DyEEEIIoZDoPIQQQgihkOg8hBBCCKGQ6DyEEIY0Sf3pK6FbJT0gaVSrM9UjaYykhZJ2pq8trpU0rUm/a4kaVGJNlS7
Prri/qB0qiob2F52HEMJQd8j2JNsXAoeBD+ddUdLw5sWqaRHZVU27bU8gu9bB2DwrKjOsatnx5p8LHO082L7W9vbjfM5wAojOQwihk/wGeD1ktTUkPZpGJRYOfNBKOpAuqrMOmC7pC5LWp5GLbw5UWpU0T9L2VMhpWVr2SkkPpmV9kiam5QskLZa0RtIuSccU1pJ0PjANmG/7ZYBUHfHh9PiNKcNWSR9Py7okPSnpLuBxYFyN/FMkPSLpMUkrq67kOPC7j9nGNCrRA9yf9tHIlL8nrTNH0pa0zq0Vz3VA0i2SNqV98Or/y18uDCnReQghdARl9UreA2yR9EbgA8BbbE8C+oGrUtPRwFbb02z/FrjD9tQ0cjESuCK1+wwwORVyGhjNuBnYmJZ9FlhaEeENwLvIyiZ/UdJJVREnAE/UKlQkaQrwQbLOxcXAdcou8w0wHlhqe7LtPZX5gXXAN4BZtqcAi4FbauyeY7bR9nJgA3BVGrk5VJHnbOBW4B3AJGCqpIF6H6OBPtsXAWvJylaHE0x0HkIIQ93IdFnpDcBfyC5DPoOs8NL69NgM4LzUvh/4QcX6l0laJ2kL2YflhLR8M9lR+dXAS2nZJcB3AGyvIrvc+WnpsYdtv2h7H/A0UOSI/BLgR7aft30A+CHw1vTYHtt9FW0r848nK5D1y7Sd88mKJVWrt431TAXW2P57uiTy/cDb0mOHgYEy4I8BXTm3MXSQEa0OEEIIx+lQGl04Kp16uNf2TTXavzBw9C/pFOAuoMf2U5IWAKekdpeTfWD2Ap+XNIHBSyJXVg/t59j3123ARZKGDZy2qIw8yPY9Xy9/Wm+b7en1Vm6wjXVXG+SxIxU1E2ptZzgBxMhDCKET/RqYJeksODpX4XU12g18iO6TNIas2ippYuI426uBTwGnkxX7Wks6/SHpUmCf7efyBLK9k2x05OaKeRXdkmam571S0ihJo4H3ks3faOQPwKskTU/Pd1Lq5DTcxuRfZOW9q60D3i5pbJorMgd4JM92hhND9BhDCB3H9nZJ84FfpI7AEeB6YE9Vu2ck3QNsAXaTlTwGGA7cl05JCLg9tV0AfFvSZuAgxatvXgt8Fdgh6SCwn6xS4+OSlgCPpnaLbG+U1NVgOw+niY9fT1lHAF8jG+VotI0AS4C7JR0Cples8zdJN5FVkhTwU9sPFdzW0MGiqmYIIYQQConTFiGEEEIoJDoPIYQQQigkOg8hhBBCKCQ6DyGEEEIoJDoPIYQQQigkOg8hhBBCKCQ6DyGEEEIo5N91/SlE9JWKHwAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -1393,7 +1451,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1403,9 +1461,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/papermill.record+json": { + "results": { + "Doc2vec Cosine": 0.528387685928394, + "Doc2vec Cosine with Stop Words": 0.45572884639905675, + "GLoVe Cosine": 0.6688056947022161, + "GLoVe Cosine with Stop Words": 0.6049380247374541, + "GLoVe WMD": 0.6267300417407605, + "GLoVe WMD with Stop Words": 0.48470008225931194, + "TF-IDF Cosine": 0.6749213786510483, + "TF-IDF Cosine with Stop Words": 0.7118087132257667, + "Word2vec Cosine": 0.6476606845766778, + "Word2vec Cosine with Stop Words": 0.6683808069062863, + "Word2vec WMD": 0.6574175839579567, + "Word2vec WMD with Stop Words": 0.5689438215886101, + "fastText Cosine": 0.6707510007525627, + "fastText Cosine with Stop Words": 0.6771300330824099, + "fastText WMD": 0.6394958913339955, + "fastText WMD with Stop Words": 0.5177829727556036 + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Record results with papermill for tests\n", "pm.record(\"results\", results)" @@ -1428,7 +1513,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb b/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb index 41fc46729..4148272e4 100644 --- a/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb +++ b/scenarios/sentence_similarity/gensen_aml_deep_dive.ipynb @@ -127,11 +127,18 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failure while loading azureml_run_type_providers. 
Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception module 'azureml.train.hyperdrive' has no attribute 'HyperDriveRun'.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -188,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": { "scrolled": true }, @@ -242,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -285,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -301,9 +308,9 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { @@ -484,7 +491,7 @@ "4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN " ] }, - "execution_count": 22, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -511,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -591,7 +598,7 @@ "4 There are children present " ] }, - "execution_count": 23, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -622,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -654,7 +661,7 @@ "'../../data\\\\clean/snli_1.0'" ] }, - "execution_count": 25, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -682,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -691,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -713,29 +720,29 @@ "source": [ "**Prerequisites:**\n", 
"\n", - "Upload the all the local files under `data_folder` to the path `./data/processed/` on the default datastore.\n", + "Upload the all the local files under `data_folder` to the default datastore.\n", "\n", "**Note: To download data required to train a GenSen model in the original paper, run code [here](https://github.com/Maluuba/gensen/blob/master/get_data.sh). By training on the original datasets (training time around 20 hours), it will reproduce the results in the [paper](https://arxiv.org/abs/1804.00079). For simplicity, we will train on a smaller dataset, which is SNLI preprocessed in [1 Data Loading and Preprocessing](#1-Data-Loading-and-Preprocessing) for showcasing the example.**" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "$AZUREML_DATAREFERENCE_6faee69b569b4268b8bf027b0bb4fd73" + "$AZUREML_DATAREFERENCE_liqungensen" ] }, - "execution_count": 29, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ds.upload(src_dir=data_folder, target_path=\"data/processed\", overwrite=True, show_progress=False)" + "ds.upload(src_dir=data_folder, overwrite=True, show_progress=False)" ] }, { @@ -769,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -777,13 +784,13 @@ "output_type": "stream", "text": [ "Found existing compute target.\n", - "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-14T20:39:31.676000+00:00', 'errors': None, 'creationTime': '2019-06-03T21:18:34.507970+00:00', 'modifiedTime': '2019-06-03T21:18:50.790782+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 
'maxNodeCount': 8, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n" + "{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 2, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-21T20:14:04.778000+00:00', 'errors': None, 'creationTime': '2019-06-19T02:57:39.833104+00:00', 'modifiedTime': '2019-06-19T02:58:11.339451+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n" ] } ], "source": [ "# choose a name for your cluster\n", - "cluster_name = \"gpugensen\"\n", + "cluster_name = \"gensen-mlflow\"\n", "\n", "try:\n", " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", @@ -791,7 +798,7 @@ "except ComputeTargetException:\n", " print('Creating a new compute target...')\n", " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n", - " max_nodes=8)\n", + " max_nodes=2)\n", "\n", " # create the cluster\n", " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", @@ -814,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -836,19 +843,11 @@ "metadata": {}, "source": [ "### 2.3.1 Prepare Training Script\n", - "Now you will need to create your training script. In this tutorial, the script for distributed training of GENSEN is already provided for you at `train.py`. In practice, you should be able to take any custom PyTorch training script as is and run it with Azure ML without having to modify your code.\n", + "Now you will need to create your training script. 
In this tutorial, the script for distributed training of GENSEN is already provided for you at `gensen_train.py`. \n", "\n", - "However, if you would like to use Azure ML's [metric logging](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#logging) capabilities, you will have to add a small amount of Azure ML logic inside your training script. In this example, at each logging interval, we will log the loss for that minibatch to our Azure ML run.\n", + "In this example, we use MLflow to log your metrics. We also use the [Azure ML-Mlflow](https://pypi.org/project/azureml-mlflow/) package to log these metrics to the Azure Portal. This is done with no change to the provided training script!\n", "\n", - "To do so, in `train.py`, we will first access the Azure ML `Run` object within the script:\n", - "```Python\n", - "from azureml.core.run import Run\n", - "run = Run.get_context()\n", - "```\n", - "Later within the script, we log the loss metric to our run:\n", - "```Python\n", - "run.log('loss', loss.item())\n", - "```" + "In this example the script provided logs the loss for that minibatch to our Azure ML portal." 
] }, { @@ -872,7 +871,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -881,14 +880,12 @@ "'../../utils_nlp/gensen/gensen_config.json'" ] }, - "execution_count": 36, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import shutil\n", - "\n", "gensen_folder = os.path.join(project_folder,'utils_nlp/gensen/')\n", "shutil.copy('gensen_train.py', gensen_folder)\n", "shutil.copy('gensen_config.json', gensen_folder)" @@ -904,7 +901,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -919,12 +916,12 @@ "### 2.3.3 Create a PyTorch Estimator\n", "The Azure ML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).\n", "\n", - "`sample_config.json` defines all the hyperparameters and paths when training GenSen model. The trained model will be saved in `data/models/example` to Azure Blob Storage. **Remember to clean `data/models/example` folder in order to save new models.**" + "`gensen_config.json` defines all the hyperparameters and paths when training GenSen model. The trained model will be saved in `models` to Azure Blob Storage. 
**Remember to clean `models` folder in order to save new models.**" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -944,11 +941,12 @@ " script_params=script_params,\n", " compute_target=compute_target,\n", " entry_script='utils_nlp/gensen/gensen_train.py',\n", - " node_count=4,\n", + " node_count=2,\n", " process_count_per_node=1,\n", " distributed_training=MpiConfiguration(),\n", " use_gpu=True,\n", - " conda_packages=['scikit-learn=0.20.3', 'h5py', 'nltk']\n", + " conda_packages=['scikit-learn=0.20.3', 'h5py', 'nltk'],\n", + " pip_packages=['azureml-mlflow>=1.0.43.1','numpy>=1.16.0']\n", " )" ] }, @@ -981,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -990,7 +988,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Submitting C:\\Users\\lishao\\Project\\Rotation2\\NLP directory for run. The size of the directory >= 25 MB, so it can take a few minutes.\n" + "Submitting E:\\Projects\\NLP-BP\\temp\\nlp directory for run. The size of the directory >= 25 MB, so it can take a few minutes.\n" ] }, { @@ -998,9 +996,9 @@ "output_type": "stream", "text": [ "Run(Experiment: pytorch-gensen,\n", - "Id: pytorch-gensen_1560797674_e36e44f4,\n", + "Id: pytorch-gensen_1561150688_f84eab04,\n", "Type: azureml.scriptrun,\n", - "Status: Preparing)\n" + "Status: Queued)\n" ] } ], @@ -1021,7 +1019,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Horovod on AzureML**\n", + "#### Horovod on AzureML\n", "\n", "[Horovod](https://github.com/horovod/horovod) is a distributed training framework for TensorFlow, PyTorch etc. to make distributed Deep Learning fast and easy to use. We have created 2 nodes in the GPU cluster on AzureML. By using Horovod, we can use those two machines to train the model in parallel. 
In theory, the model trains faster on AzureML than on VM which uses single machine because it converges faster which we will get lower loss. However, by using more nodes, the model may take more time in communicating with each node. The communication time could be ignored when the model is trained on the large datasets.\n", "\n", @@ -1030,11 +1028,10 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "**Interpret the Training Results**\n", + "#### Interpret the Training Results\n", "\n", "The following chart shows the model validation loss (the less loss, the better performance) with different nodes with AmlCompute:\n", "\n", @@ -1045,28 +1042,29 @@ "From the chart, we can tell training with more nodes, the performance is getting better with lower loss." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Azureml Widget allows an easy way to stream updates of the logged metrics right into your notebook. To use this feature install the widget by running the commands below. 
\n", + "\n", + "```\n", + "conda install ipywidgets\n", + "\n", + "jupyter nbextension install --py --user azureml.widgets\n", + "\n", + "jupyter nbextension enable azureml.widgets --user --py\n", + "\n", + "```" + ] + }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": { - "scrolled": true + "scrolled": false }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "19d55fcc0871444da604b1d828d9eac4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "RunDetails(run).show()" ] @@ -1088,7 +1086,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "run.wait_for_completion(show_output=True) # this provides a verbose log" @@ -1117,6 +1117,35 @@ " ```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3.6 Clean up after training\n", + "\n", + "We finally delete the training script `gensen_train.py` and config file `gensen_config.json` from the project directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "gensen_train = os.path.join(project_folder,'utils_nlp/gensen/gensen_train.py')\n", + "gensen_config = os.path.join(project_folder,'utils_nlp/gensen/gensen_config.json')\n", + "\n", + "if os.path.isfile(gensen_train):\n", + " os.remove(gensen_train)\n", + "else:\n", + " print(\"Error: %s file not found\" % gensen_train)\n", + " \n", + "if os.path.isfile(gensen_config):\n", + " os.remove(gensen_config)\n", + "else:\n", + " print(\"Error: %s file not found\" % gensen_config)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1138,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1167,7 +1196,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1197,54 +1226,11 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c61e610d4601486e9f41fd852320b47b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5c47f13e11c646cd865d4f286b70ab0c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "RunDetails(hyperdrive_run).show()" ] @@ -1334,9 +1320,9 @@ } ], "kernelspec": { - "display_name": "Python 3", + "display_name": "Python NLP CPU", "language": "python", - "name": "python3" + "name": "nlp_cpu" }, "language_info": { "codemirror_mode": { diff --git a/scenarios/sentence_similarity/gensen_config.json b/scenarios/sentence_similarity/gensen_config.json index f7e47a76c..54aff67b7 100644 --- a/scenarios/sentence_similarity/gensen_config.json +++ b/scenarios/sentence_similarity/gensen_config.json @@ -15,20 +15,20 @@ }, "data": {"paths": [ { - "train_src": "data/processed/snli_1.0_train.txt.s1.tok", - "train_trg": "data/processed/snli_1.0_train.txt.s2.tok", - "val_src": "data/processed/snli_1.0_dev.txt.s1.tok", - "val_trg": "data/processed/snli_1.0_dev.txt.s1.tok", + "train_src": "snli_1.0_train.txt.s1.tok", + "train_trg": "snli_1.0_train.txt.s2.tok", + "val_src": "snli_1.0_dev.txt.s1.tok", + "val_trg": "snli_1.0_dev.txt.s1.tok", "taskname": "snli" } ], "max_src_length": 90, "max_trg_length": 90, "task": "multi-seq2seq-nli", - "save_dir": "data/models/example", - "nli_train": "data/processed/snli_1.0_train.txt.clean.noblank", - "nli_dev": "data/processed/snli_1.0_dev.txt.clean.noblank", - "nli_test": "data/processed/snli_1.0_test.txt.clean.noblank" + "save_dir": "models/", + "nli_train": "snli_1.0_train.txt.clean.noblank", + "nli_dev": "snli_1.0_dev.txt.clean.noblank", + "nli_test": "snli_1.0_test.txt.clean.noblank" }, "model": { "dim_src": 2048, diff --git a/scenarios/sentence_similarity/gensen_train.py b/scenarios/sentence_similarity/gensen_train.py index 7b704a157..b29fb1dcb 100644 --- a/scenarios/sentence_similarity/gensen_train.py +++ b/scenarios/sentence_similarity/gensen_train.py @@ -15,20 +15,20 @@ This training process is based on GPU only. 
""" -import logging import argparse -import os import json +import logging +import os import time +import horovod.torch as hvd +import mlflow import numpy as np import torch import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as f import torch.optim as optim -from azureml.core.run import Run -import horovod.torch as hvd from utils_nlp.gensen.multi_task_model import MultitaskModel from utils_nlp.gensen.utils import ( @@ -37,10 +37,8 @@ compute_validation_loss, ) -# get the Azure ML run object -run = Run.get_context() - cudnn.benchmark = True +logger = logging.getLogger(__name__) hvd.init() if torch.cuda.is_available(): @@ -138,6 +136,7 @@ def evaluate( model_state, ): """ Function to validate the model. + Args: model_state(dict): Saved model weights. config(dict): Config object. @@ -146,9 +145,11 @@ def evaluate( loss_criterion(nn.CrossEntropyLoss): Cross entropy loss. monitor_epoch(int): Current epoch count. min_val_loss(float): Minimum validation loss - min_val_loss_epoch(int): Epoch where the minimum validation loss was seen. + min_val_loss_epoch(int): Epoch where the minimum validation + loss was seen. save_dir(str): Directory path to save the model dictionary. starting_time(time.Time): Starting time of the training. + Returns: bool: Whether to continue training or not. """ @@ -172,7 +173,9 @@ def evaluate( # Horovod: print output only on first rank. if hvd.rank() == 0: # log the best val accuracy to AML run - run.log("Best Validation Loss", np.float(validation_loss)) + logging.info( + "Best Validation Loss: {}".format(np.float(validation_loss)) + ) # If the validation loss is small enough, and it starts to go up. # Should stop training. 
@@ -182,8 +185,6 @@ def evaluate( min_val_loss_epoch = monitor_epoch model_state = model.state_dict() - run.log("Validation Loss", validation_loss) - print(monitor_epoch, min_val_loss_epoch, min_val_loss) logging.info( "Monitor epoch: %d Validation Loss: %.3f Min Validation Epoch: " "%d Loss : %.3f " @@ -275,312 +276,333 @@ def train(config, data_folder, learning_rate=0.0001): config(dict): Loaded json file as a python object. data_folder(str): Path to the folder containing the data. learning_rate(float): Learning rate for the model. - """ owd = os.getcwd() + os.chdir(data_folder) try: - save_dir = config["data"]["save_dir"] - - os.chdir(data_folder) - - if not os.path.exists("./log"): - os.makedirs("./log") - - os.makedirs(save_dir, exist_ok=True) - - setup_logging(config) - - batch_size = config["training"]["batch_size"] - src_vocab_size = config["model"]["n_words_src"] - trg_vocab_size = config["model"]["n_words_trg"] - max_len_src = config["data"]["max_src_length"] - max_len_trg = config["data"]["max_trg_length"] - model_state = {} - - train_src = [item["train_src"] for item in config["data"]["paths"]] - train_trg = [item["train_trg"] for item in config["data"]["paths"]] - tasknames = [item["taskname"] for item in config["data"]["paths"]] - - # Keep track of indicies to train forward and backward jointly - if ( - "skipthought_next" in tasknames - and "skipthought_previous" in tasknames - ): - skipthought_idx = tasknames.index("skipthought_next") - skipthought_backward_idx = tasknames.index("skipthought_previous") - paired_tasks = { - skipthought_idx: skipthought_backward_idx, - skipthought_backward_idx: skipthought_idx, - } - else: - paired_tasks = None - skipthought_idx = None - skipthought_backward_idx = None - - train_iterator = BufferedDataIterator( - train_src, - train_trg, - src_vocab_size, - trg_vocab_size, - tasknames, - save_dir, - buffer_size=1e6, - lowercase=True, - seed=(hvd.rank() + 1) * 12345, - ) + with mlflow.start_run(): + save_dir = 
config["data"]["save_dir"] + if not os.path.exists("./log"): + os.makedirs("./log") - nli_iterator = NLIIterator( - train=config["data"]["nli_train"], - dev=config["data"]["nli_dev"], - test=config["data"]["nli_test"], - vocab_size=-1, - vocab=os.path.join(save_dir, "src_vocab.pkl"), - seed=(hvd.rank() + 1) * 12345, - ) + os.makedirs(save_dir, exist_ok=True) - src_vocab_size = len(train_iterator.src[0]["word2id"]) - trg_vocab_size = len(train_iterator.trg[0]["word2id"]) + setup_logging(config) - # Logging set up. - logging.info("Finished creating iterator ...") - log_config(config) - logging.info( - "Found %d words in source : " - % (len(train_iterator.src[0]["id2word"])) - ) - for idx, taskname in enumerate(tasknames): - logging.info( - "Found %d target words in task %s " - % (len(train_iterator.trg[idx]["id2word"]), taskname) - ) - logging.info("Found %d words in src " % src_vocab_size) - logging.info("Found %d words in trg " % trg_vocab_size) - - weight_mask = torch.ones(trg_vocab_size).cuda() - weight_mask[train_iterator.trg[0]["word2id"][""]] = 0 - loss_criterion = nn.CrossEntropyLoss(weight=weight_mask).cuda() - nli_criterion = nn.CrossEntropyLoss().cuda() - - model = MultitaskModel( - src_emb_dim=config["model"]["dim_word_src"], - trg_emb_dim=config["model"]["dim_word_trg"], - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - src_hidden_dim=config["model"]["dim_src"], - trg_hidden_dim=config["model"]["dim_trg"], - bidirectional=config["model"]["bidirectional"], - pad_token_src=train_iterator.src[0]["word2id"][""], - pad_token_trg=train_iterator.trg[0]["word2id"][""], - nlayers_src=config["model"]["n_layers_src"], - dropout=config["model"]["dropout"], - num_tasks=len(train_iterator.src), - paired_tasks=paired_tasks, - ).cuda() - - optimizer = setup_horovod(model, learning_rate=learning_rate) - logging.info(model) - - n_gpus = config["training"]["n_gpus"] - model = torch.nn.DataParallel(model, device_ids=range(n_gpus)) - - task_losses = [[] for 
_ in tasknames] - task_idxs = [0 for _ in tasknames] - nli_losses = [] - updates = 0 - nli_ctr = 0 - nli_epoch = 0 - monitor_epoch = 0 - nli_mbatch_ctr = 0 - mbatch_times = [] - min_val_loss = 10000000 - min_val_loss_epoch = -1 - rng_num_tasks = len(tasknames) - 1 if paired_tasks else len(tasknames) - logging.info("Commencing Training ...") - start = time.time() - while True: - # Train NLI once every 10 minibatches of other tasks - if nli_ctr % 10 == 0: - minibatch = nli_iterator.get_parallel_minibatch( - nli_mbatch_ctr, batch_size * n_gpus - ) - optimizer.zero_grad() - class_logits = model( - minibatch, -1, return_hidden=False, paired_trg=None - ) + batch_size = config["training"]["batch_size"] + src_vocab_size = config["model"]["n_words_src"] + trg_vocab_size = config["model"]["n_words_trg"] + max_len_src = config["data"]["max_src_length"] + max_len_trg = config["data"]["max_trg_length"] + model_state = {} + + train_src = [item["train_src"] for item in config["data"]["paths"]] + train_trg = [item["train_trg"] for item in config["data"]["paths"]] + tasknames = [item["taskname"] for item in config["data"]["paths"]] - loss = nli_criterion( - class_logits.contiguous().view(-1, class_logits.size(1)), - minibatch["labels"].contiguous().view(-1), + # Keep track of indicies to train forward and backward jointly + if ( + "skipthought_next" in tasknames + and "skipthought_previous" in tasknames + ): + skipthought_idx = tasknames.index("skipthought_next") + skipthought_backward_idx = tasknames.index( + "skipthought_previous" ) + paired_tasks = { + skipthought_idx: skipthought_backward_idx, + skipthought_backward_idx: skipthought_idx, + } + else: + paired_tasks = None + skipthought_idx = None + skipthought_backward_idx = None + + train_iterator = BufferedDataIterator( + train_src, + train_trg, + src_vocab_size, + trg_vocab_size, + tasknames, + save_dir, + buffer_size=1e6, + lowercase=True, + seed=(hvd.rank() + 1) * 12345, + ) - # nli_losses.append(loss.data[0]) - 
nli_losses.append(loss.item()) - loss.backward() - torch.nn.utils.clip_grad_norm(model.parameters(), 1.0) - optimizer.step() + nli_iterator = NLIIterator( + train=config["data"]["nli_train"], + dev=config["data"]["nli_dev"], + test=config["data"]["nli_test"], + vocab_size=-1, + vocab=os.path.join(save_dir, "src_vocab.pkl"), + seed=(hvd.rank() + 1) * 12345, + ) - # For AML. - run.log("loss", loss.item()) + src_vocab_size = len(train_iterator.src[0]["word2id"]) + trg_vocab_size = len(train_iterator.trg[0]["word2id"]) - nli_mbatch_ctr += batch_size * n_gpus - if nli_mbatch_ctr >= len(nli_iterator.train_lines): - nli_mbatch_ctr = 0 - nli_epoch += 1 - else: - # Sample a random task - task_idx = np.random.randint(low=0, high=rng_num_tasks) - - # Get a minibatch corresponding to the sampled task - minibatch = train_iterator.get_parallel_minibatch( - task_idx, - task_idxs[task_idx], - batch_size * n_gpus, - max_len_src, - max_len_trg, + # Logging set up. + logging.info("Finished creating iterator ...") + log_config(config) + logging.info( + "Found %d words in source : " + % (len(train_iterator.src[0]["id2word"])) + ) + for idx, taskname in enumerate(tasknames): + logging.info( + "Found %d target words in task %s " + % (len(train_iterator.trg[idx]["id2word"]), taskname) ) - - """Increment pointer into task and if current buffer is - exhausted, fetch new buffer. 
""" - task_idxs[task_idx] += batch_size * n_gpus - if task_idxs[task_idx] >= train_iterator.buffer_size: - train_iterator.fetch_buffer(task_idx) - task_idxs[task_idx] = 0 - - if task_idx == skipthought_idx: - minibatch_back = train_iterator.get_parallel_minibatch( - skipthought_backward_idx, - task_idxs[skipthought_backward_idx], - batch_size * n_gpus, - max_len_src, - max_len_trg, + logging.info("Found %d words in src " % src_vocab_size) + logging.info("Found %d words in trg " % trg_vocab_size) + + weight_mask = torch.ones(trg_vocab_size).cuda() + weight_mask[train_iterator.trg[0]["word2id"][""]] = 0 + loss_criterion = nn.CrossEntropyLoss(weight=weight_mask).cuda() + nli_criterion = nn.CrossEntropyLoss().cuda() + + model = MultitaskModel( + src_emb_dim=config["model"]["dim_word_src"], + trg_emb_dim=config["model"]["dim_word_trg"], + src_vocab_size=src_vocab_size, + trg_vocab_size=trg_vocab_size, + src_hidden_dim=config["model"]["dim_src"], + trg_hidden_dim=config["model"]["dim_trg"], + bidirectional=config["model"]["bidirectional"], + pad_token_src=train_iterator.src[0]["word2id"][""], + pad_token_trg=train_iterator.trg[0]["word2id"][""], + nlayers_src=config["model"]["n_layers_src"], + dropout=config["model"]["dropout"], + num_tasks=len(train_iterator.src), + paired_tasks=paired_tasks, + ).cuda() + + optimizer = setup_horovod(model, learning_rate=learning_rate) + logging.info(model) + + n_gpus = config["training"]["n_gpus"] + model = torch.nn.DataParallel(model, device_ids=range(n_gpus)) + + task_losses = [[] for _ in tasknames] + task_idxs = [0 for _ in tasknames] + nli_losses = [] + updates = 0 + nli_ctr = 0 + nli_epoch = 0 + monitor_epoch = 0 + nli_mbatch_ctr = 0 + mbatch_times = [] + min_val_loss = 10000000 + min_val_loss_epoch = -1 + rng_num_tasks = ( + len(tasknames) - 1 if paired_tasks else len(tasknames) + ) + logging.info("OS Environ: \n {} \n\n".format(os.environ)) + mlflow.log_param("learning_rate", learning_rate) + logging.info("Commencing Training 
...") + start = time.time() + while True: + batch_start_time = time.time() + # Train NLI once every 10 minibatches of other tasks + if nli_ctr % 10 == 0: + minibatch = nli_iterator.get_parallel_minibatch( + nli_mbatch_ctr, batch_size * n_gpus ) - task_idxs[skipthought_backward_idx] += batch_size * n_gpus - if ( - task_idxs[skipthought_backward_idx] - >= train_iterator.buffer_size - ): - train_iterator.fetch_buffer(skipthought_backward_idx) - task_idxs[skipthought_backward_idx] = 0 - optimizer.zero_grad() - decoder_logit, decoder_logit_2 = model( - minibatch, - task_idx, - paired_trg=minibatch_back["input_trg"], - ) - - loss_f = loss_criterion( - decoder_logit.contiguous().view( - -1, decoder_logit.size(2) - ), - minibatch["output_trg"].contiguous().view(-1), + class_logits = model( + minibatch, -1, return_hidden=False, paired_trg=None ) - loss_b = loss_criterion( - decoder_logit_2.contiguous().view( - -1, decoder_logit_2.size(2) + loss = nli_criterion( + class_logits.contiguous().view( + -1, class_logits.size(1) ), - minibatch_back["output_trg"].contiguous().view(-1), + minibatch["labels"].contiguous().view(-1), ) - task_losses[task_idx].append(loss_f.data[0]) - task_losses[skipthought_backward_idx].append( - loss_b.data[0] - ) - loss = loss_f + loss_b + # nli_losses.append(loss.data[0]) + nli_losses.append(loss.item()) + loss.backward() + torch.nn.utils.clip_grad_norm(model.parameters(), 1.0) + optimizer.step() + nli_mbatch_ctr += batch_size * n_gpus + if nli_mbatch_ctr >= len(nli_iterator.train_lines): + nli_mbatch_ctr = 0 + nli_epoch += 1 else: - optimizer.zero_grad() - decoder_logit = model(minibatch, task_idx) + # Sample a random task + task_idx = np.random.randint(low=0, high=rng_num_tasks) - loss = loss_criterion( - decoder_logit.contiguous().view( - -1, decoder_logit.size(2) - ), - minibatch["output_trg"].contiguous().view(-1), + # Get a minibatch corresponding to the sampled task + minibatch = train_iterator.get_parallel_minibatch( + task_idx, + 
task_idxs[task_idx], + batch_size * n_gpus, + max_len_src, + max_len_trg, ) - task_losses[task_idx].append(loss.item()) + """Increment pointer into task and if current buffer is + exhausted, fetch new buffer. """ + task_idxs[task_idx] += batch_size * n_gpus + if task_idxs[task_idx] >= train_iterator.buffer_size: + train_iterator.fetch_buffer(task_idx) + task_idxs[task_idx] = 0 + + if task_idx == skipthought_idx: + minibatch_back = train_iterator.get_parallel_minibatch( + skipthought_backward_idx, + task_idxs[skipthought_backward_idx], + batch_size * n_gpus, + max_len_src, + max_len_trg, + ) + task_idxs[skipthought_backward_idx] += ( + batch_size * n_gpus + ) + if ( + task_idxs[skipthought_backward_idx] + >= train_iterator.buffer_size + ): + train_iterator.fetch_buffer( + skipthought_backward_idx + ) + task_idxs[skipthought_backward_idx] = 0 + + optimizer.zero_grad() + decoder_logit, decoder_logit_2 = model( + minibatch, + task_idx, + paired_trg=minibatch_back["input_trg"], + ) + + loss_f = loss_criterion( + decoder_logit.contiguous().view( + -1, decoder_logit.size(2) + ), + minibatch["output_trg"].contiguous().view(-1), + ) + + loss_b = loss_criterion( + decoder_logit_2.contiguous().view( + -1, decoder_logit_2.size(2) + ), + minibatch_back["output_trg"].contiguous().view(-1), + ) - loss.backward() - # For distributed optimizer need to sync before gradient - # clipping. 
- optimizer.synchronize() + task_losses[task_idx].append(loss_f.data[0]) + task_losses[skipthought_backward_idx].append( + loss_b.data[0] + ) + loss = loss_f + loss_b - torch.nn.utils.clip_grad_norm(model.parameters(), 1.0) - optimizer.step() + else: + optimizer.zero_grad() + decoder_logit = model(minibatch, task_idx) - end = time.time() - mbatch_times.append(end - start) + loss = loss_criterion( + decoder_logit.contiguous().view( + -1, decoder_logit.size(2) + ), + minibatch["output_trg"].contiguous().view(-1), + ) + + task_losses[task_idx].append(loss.item()) + + loss.backward() + # For distributed optimizer need to sync before gradient + # clipping. + optimizer.synchronize() + + torch.nn.utils.clip_grad_norm(model.parameters(), 1.0) + optimizer.step() + + end = time.time() + mbatch_times.append(end - batch_start_time) + + # Validations + if ( + updates % config["management"]["monitor_loss"] == 0 + and updates != 0 + ): + monitor_epoch += 1 + for idx, task in enumerate(tasknames): + logging.info( + "Seq2Seq Examples Processed : %d %s Loss : %.5f Num %s " + "minibatches : %d" + % ( + updates, + task, + np.mean(task_losses[idx]), + task, + len(task_losses[idx]), + ) + ) + mlflow.log_metric( + "validation_loss", + np.mean(task_losses[idx]), + step=monitor_epoch, + ) - # Validations - if ( - updates % config["management"]["monitor_loss"] == 0 - and updates != 0 - ): - monitor_epoch += 1 - for idx, task in enumerate(tasknames): logging.info( - "Seq2Seq Examples Processed : %d %s Loss : %.5f Num %s " - "minibatches : %d" + "Round: %d NLI Epoch : %d NLI Examples Processed : %d NLI " + "Loss : %.5f " % ( - updates, - task, - np.mean(task_losses[idx]), - task, - len(task_losses[idx]), + nli_ctr, + nli_epoch, + nli_mbatch_ctr, + np.mean(nli_losses), ) ) - run.log("Task Loss", np.mean(task_losses[idx])) + mlflow.log_metric( + "nli_loss", np.mean(nli_losses), step=nli_epoch + ) - logging.info( - "Round: %d NLI Epoch : %d NLI Examples Processed : %d NLI " - "Loss : %.5f " - % 
(nli_ctr, nli_epoch, nli_mbatch_ctr, np.mean(nli_losses)) - ) - run.log("NLI Loss", np.mean(nli_losses)) - logging.info( - "Average time per mininbatch : %.5f" - % (np.mean(mbatch_times)) - ) - run.log( - "Average time per mininbatch : ", np.mean(mbatch_times) - ) - task_losses = [[] for _ in tasknames] - mbatch_times = [] - nli_losses = [] - - # For validate and break if done. - logging.info("############################") - logging.info("##### Evaluating model #####") - logging.info("############################") - training_complete, min_val_loss_epoch, min_val_loss, model_state = evaluate( - config=config, - train_iterator=train_iterator, - model=model, - loss_criterion=loss_criterion, - monitor_epoch=monitor_epoch, - min_val_loss=min_val_loss, - min_val_loss_epoch=min_val_loss_epoch, - save_dir=save_dir, - starting_time=start, - model_state=model_state, - ) - if training_complete: - break - - logging.info("Evaluating on NLI") - evaluate_nli( - nli_iterator=nli_iterator, - model=model, - n_gpus=n_gpus, - batch_size=batch_size, - ) + logging.info( + "Average time per mininbatch : %.5f" + % (np.mean(mbatch_times)) + ) + mlflow.log_metric( + "minibatch_avg_duration", np.mean(mbatch_times) + ) + + task_losses = [[] for _ in tasknames] + mbatch_times = [] + nli_losses = [] + + # For validate and break if done. 
+ logging.info("############################") + logging.info("##### Evaluating model #####") + logging.info("############################") + training_complete, min_val_loss_epoch, min_val_loss, model_state = evaluate( + config=config, + train_iterator=train_iterator, + model=model, + loss_criterion=loss_criterion, + monitor_epoch=monitor_epoch, + min_val_loss=min_val_loss, + min_val_loss_epoch=min_val_loss_epoch, + save_dir=save_dir, + starting_time=start, + model_state=model_state, + ) + if training_complete: + break + + logging.info("Evaluating on NLI") + evaluate_nli( + nli_iterator=nli_iterator, + model=model, + n_gpus=n_gpus, + batch_size=batch_size, + ) - updates += batch_size * n_gpus - nli_ctr += 1 - logging.info("Updates: %d" % updates) + updates += batch_size * n_gpus + nli_ctr += 1 + logging.info("Updates: %d" % updates) finally: os.chdir(owd) diff --git a/scenarios/text_classification/README.md b/scenarios/text_classification/README.md index e69de29bb..5a8e46488 100644 --- a/scenarios/text_classification/README.md +++ b/scenarios/text_classification/README.md @@ -0,0 +1,3 @@ +# Text Classification + +Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. The state-of-the-art methods are based on neural networks of different architectures as well as pretrained language models or word embeddings. Text classification is a core task in natural language Processing and has numerous applications such as sentiment analysis, document indexing in digital libraries, hate speech detection, and general-purpose categorization in medical, academic, legal, and many other domains. 
diff --git a/scenarios/text_classification/tc_dac_bert_ar.ipynb b/scenarios/text_classification/tc_dac_bert_ar.ipynb index f7a56b655..8cea7806f 100644 --- a/scenarios/text_classification/tc_dac_bert_ar.ipynb +++ b/scenarios/text_classification/tc_dac_bert_ar.ipynb @@ -327,8 +327,8 @@ "outputs": [], "source": [ "tokenizer = Tokenizer(LANGUAGE, cache_dir=BERT_CACHE_DIR)\n", - "tokens_train = tokenizer.tokenize(df_train[text_col].astype(str))\n", - "tokens_test = tokenizer.tokenize(df_test[text_col].astype(str))" + "tokens_train = tokenizer.tokenize(list(df_train[text_col].astype(str)))\n", + "tokens_test = tokenizer.tokenize(list(df_test[text_col].astype(str)))" ] }, { diff --git a/scenarios/text_classification/tc_mnli_bert.ipynb b/scenarios/text_classification/tc_mnli_bert.ipynb index c7c2b0344..d4e40d6c3 100644 --- a/scenarios/text_classification/tc_mnli_bert.ipynb +++ b/scenarios/text_classification/tc_mnli_bert.ipynb @@ -256,14 +256,23 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 78540/78540 [00:26<00:00, 2991.68it/s]\n", + "100%|██████████| 52360/52360 [00:17<00:00, 2981.71it/s]\n" + ] + } + ], "source": [ "tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)\n", "\n", - "tokens_train = tokenizer.tokenize(df_train[TEXT_COL])\n", - "tokens_test = tokenizer.tokenize(df_test[TEXT_COL])" + "tokens_train = tokenizer.tokenize(list(df_train[TEXT_COL]))\n", + "tokens_test = tokenizer.tokenize(list(df_test[TEXT_COL]))" ] }, { @@ -275,20 +284,21 @@ "- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n", "- Pad or truncate the token lists to the specified max length\n", "- Return mask lists that indicate paddings' positions\n", + "- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence 
classification)\n", "\n", "*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "tokens_train, mask_train = tokenizer.preprocess_classification_tokens(\n", + "tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(\n", " tokens_train, MAX_LEN\n", ")\n", - "tokens_test, mask_test = tokenizer.preprocess_classification_tokens(\n", + "tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(\n", " tokens_test, MAX_LEN\n", ")" ] @@ -303,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -322,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "scrolled": true }, @@ -338,17 +348,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "epoch:1/1; batch:1->246/2454; loss:1.824086\n", - "epoch:1/1; batch:247->492/2454; loss:0.446337\n", - "epoch:1/1; batch:493->738/2454; loss:0.298814\n", - "epoch:1/1; batch:739->984/2454; loss:0.265785\n", - "epoch:1/1; batch:985->1230/2454; loss:0.101790\n", - "epoch:1/1; batch:1231->1476/2454; loss:0.251120\n", - "epoch:1/1; batch:1477->1722/2454; loss:0.040894\n", - "epoch:1/1; batch:1723->1968/2454; loss:0.038339\n", - "epoch:1/1; batch:1969->2214/2454; loss:0.021586\n", - "epoch:1/1; batch:2215->2454/2454; loss:0.130719\n", - "[Training time: 0.980 hrs]\n" + "epoch:1/1; batch:1->246/2454; loss:1.584357\n", + "epoch:1/1; batch:247->492/2454; loss:0.110689\n", + "epoch:1/1; batch:493->738/2454; loss:0.208907\n", + "epoch:1/1; batch:739->984/2454; loss:0.423804\n", + "epoch:1/1; batch:985->1230/2454; loss:0.035525\n", + "epoch:1/1; batch:1231->1476/2454; loss:0.189890\n", + "epoch:1/1; batch:1477->1722/2454; loss:0.216201\n", + "epoch:1/1; 
batch:1723->1968/2454; loss:0.245825\n", + "epoch:1/1; batch:1969->2214/2454; loss:0.138958\n", + "epoch:1/1; batch:2215->2454/2454; loss:0.066018\n", + "[Training time: 0.963 hrs]\n" ] } ], @@ -376,14 +386,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "52384it [11:54, 88.97it/s] \n" + "52384it [11:51, 88.76it/s] \n" ] } ], @@ -403,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -412,15 +422,15 @@ "text": [ " precision recall f1-score support\n", "\n", - " fiction 0.90 0.94 0.92 10275\n", - " government 0.97 0.93 0.95 10292\n", - " slate 0.88 0.85 0.87 10277\n", + " fiction 0.88 0.96 0.91 10275\n", + " government 0.94 0.94 0.94 10292\n", + " slate 0.91 0.80 0.85 10277\n", " telephone 0.99 1.00 0.99 11205\n", " travel 0.95 0.97 0.96 10311\n", "\n", - " accuracy 0.94 52360\n", - " macro avg 0.94 0.94 0.94 52360\n", - "weighted avg 0.94 0.94 0.94 52360\n", + " accuracy 0.93 52360\n", + " macro avg 0.93 0.93 0.93 52360\n", + "weighted avg 0.93 0.93 0.93 52360\n", "\n" ] } @@ -428,6 +438,13 @@ "source": [ "print(classification_report(labels_test, preds, target_names=label_encoder.classes_))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tests/README.md b/tests/README.md index bdd57bd54..79abd7de2 100644 --- a/tests/README.md +++ b/tests/README.md @@ -2,6 +2,8 @@ This project uses unit, smoke and integration tests with Python files and notebooks. For more information, see a [quick introduction to unit, smoke and integration tests](https://miguelgfierro.com/blog/2018/a-beginners-guide-to-python-testing/). To manually execute the unit tests in the different environments, first **make sure you are in the correct environment as described in the [SETUP.md](/SETUP.md)**. 
+Tests are automatically run as part of a DevOps pipeline. The pipelines are defined in .yml files in tests/ci with filenames that align with pipeline names. + ## Test execution Click on the following menus to see more details on how to execute the unit, smoke and integration tests: diff --git a/tests/ci/cpu_unit_tests_linux.yml b/tests/ci/cpu_unit_tests_linux.yml new file mode 100644 index 000000000..f6a50b74f --- /dev/null +++ b/tests/ci/cpu_unit_tests_linux.yml @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# Pull request against these branches will trigger this build +pr: +- master +- staging + +pool: + name: "nlptestmachine" + +steps: + +- bash: | + echo "##vso[task.prependpath]/data/anaconda/bin" + displayName: Add Conda to PATH + +- bash: | + python tools/generate_conda_file.py + conda env create -n nlp_cpu -f nlp_cpu.yaml + displayName: 'Creating Conda Environment with dependencies' + +- bash: | + source activate nlp_cpu + pytest tests/unit -m "not notebooks and not gpu" --junitxml=junit/test-unitttest.xml + displayName: 'Run Unit tests' + +- bash: | + echo Remove Conda Environment + conda remove -n nlp_cpu --all -q --force -y + echo Done Cleanup + displayName: 'Cleanup Task' + condition: always() + +- task: PublishTestResults@2 + inputs: + testResultsFiles: '**/test-unitttest.xml' + testRunTitle: 'Test results for PyTest' + +- task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' \ No newline at end of file diff --git a/tests/ci/gpu_unit_tests_linux.yml b/tests/ci/gpu_unit_tests_linux.yml new file mode 100644 index 000000000..3cf91441a --- /dev/null +++ b/tests/ci/gpu_unit_tests_linux.yml @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# Pull request against these branches will trigger this build +pr: +- master +- staging + +pool: + name: "nlptestmachine" + +steps: + +- bash: | + echo "##vso[task.prependpath]/data/anaconda/bin" + displayName: Add Conda to PATH + +- bash: | + python tools/generate_conda_file.py --gpu + conda env create -n nlp_gpu -f nlp_gpu.yaml + displayName: 'Creating Conda Environment with dependencies' + +- bash: | + source activate nlp_gpu + pytest tests/unit -m "not notebooks and gpu" --junitxml=junit/test-unitttest.xml + displayName: 'Run Unit tests' + +- bash: | + echo Remove Conda Environment + conda remove -n nlp_gpu --all -q --force -y + echo Done Cleanup + displayName: 'Cleanup Task' + condition: always() + +- task: PublishTestResults@2 + inputs: + testResultsFiles: '**/test-unitttest.xml' + testRunTitle: 'Test results for PyTest' + +- task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' \ No newline at end of file diff --git a/tests/ci/notebooks_cpu_unit_tests_linux.yml b/tests/ci/notebooks_cpu_unit_tests_linux.yml new file mode 100644 index 000000000..ed66fbd3c --- /dev/null +++ b/tests/ci/notebooks_cpu_unit_tests_linux.yml @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# Pull request against these branches will trigger this build +pr: +- master +- staging + +pool: + name: "nlptestmachine" + +steps: + +- bash: | + echo "##vso[task.prependpath]/data/anaconda/bin" + displayName: Add Conda to PATH + +- bash: | + python tools/generate_conda_file.py + conda env create -n nlp_cpu -f nlp_cpu.yaml + displayName: 'Creating Conda Environment with dependencies' + +- bash: | + source activate nlp_cpu + pytest tests/unit -m "notebooks and not gpu" --junitxml=junit/test-unitttest.xml + displayName: 'Run Unit tests' + +- bash: | + echo Remove Conda Environment + conda remove -n nlp_cpu --all -q --force -y + echo Done Cleanup + displayName: 'Cleanup Task' + condition: always() + +- task: PublishTestResults@2 + inputs: + testResultsFiles: '**/test-unitttest.xml' + testRunTitle: 'Test results for PyTest' + +- task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' \ No newline at end of file diff --git a/tests/ci/notebooks_gpu_unit_tests_linux.yml b/tests/ci/notebooks_gpu_unit_tests_linux.yml new file mode 100644 index 000000000..619fd45cb --- /dev/null +++ b/tests/ci/notebooks_gpu_unit_tests_linux.yml @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# Pull request against these branches will trigger this build +pr: +- master +- staging + +pool: + name: "nlptestmachine" + +steps: + +- bash: | + echo "##vso[task.prependpath]/data/anaconda/bin" + displayName: Add Conda to PATH + +- bash: | + python tools/generate_conda_file.py --gpu + conda env create -n nlp_gpu -f nlp_gpu.yaml + displayName: 'Creating Conda Environment with dependencies' + +- bash: | + source activate nlp_gpu + pytest tests/unit -m "notebooks and gpu" --junitxml=junit/test-unitttest.xml + displayName: 'Run Unit tests' + +- bash: | + echo Remove Conda Environment + conda remove -n nlp_gpu --all -q --force -y + echo Done Cleanup + displayName: 'Cleanup Task' + condition: always() + +- task: PublishTestResults@2 + inputs: + testResultsFiles: '**/test-unitttest.xml' + testRunTitle: 'Test results for PyTest' + +- task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' \ No newline at end of file diff --git a/tests/ci/repo_metrics_pipeline.yml b/tests/ci/repo_metrics_pipeline.yml new file mode 100644 index 000000000..13166ccc2 --- /dev/null +++ b/tests/ci/repo_metrics_pipeline.yml @@ -0,0 +1,26 @@ + +jobs: +- job: Repometrics + pool: + vmImage: 'ubuntu-16.04' + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.6' + architecture: 'x64' + + - script: | + cp tools/repo_metrics/config_template.py tools/repo_metrics/config.py + sed -i ''s/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/$(github_token)/g'' tools/repo_metrics/config.py + sed -i ''s/XXXXXXXXXXXXXXXXXXXXXXXXX/$(cosmosdb_connectionstring)/g'' tools/repo_metrics/config.py + displayName: Configure CosmosDB Connection + + - script: | + python -m pip install python-dateutil>=2.80 pymongo>=3.8.0 gitpython>2.1.11 requests>=2.21.0 + python tools/repo_metrics/track_metrics.py --github_repo "https://github.com/microsoft/nlp" --save_to_database + displayName: Python script to record stats + + + + diff --git 
a/tests/conftest.py b/tests/conftest.py index c37966e73..11007e2d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,11 +9,15 @@ # file. You don’t need to import the fixture you want to use in a test, it # automatically gets discovered by pytest." -import pytest import os -from tests.notebooks_common import path_notebooks from tempfile import TemporaryDirectory +import pytest +from tests.notebooks_common import path_notebooks + +from utils_nlp.bert.common import Language +from utils_nlp.bert.common import Tokenizer as BERTTokenizer + @pytest.fixture(scope="module") def notebooks(): @@ -113,3 +117,8 @@ def ner_test_data(): "EXPECTED_TRAILING_TOKEN_MASK": TRAILING_TOKEN_MASK, "EXPECTED_LABEL_IDS": INPUT_LABEL_IDS, } + + +@pytest.fixture() +def bert_english_tokenizer(): + return BERTTokenizer(language=Language.ENGLISHCASED, to_lower=False) diff --git a/tests/unit/test_bert_common.py b/tests/unit/test_bert_common.py index f611b5ece..b559b28e0 100644 --- a/tests/unit/test_bert_common.py +++ b/tests/unit/test_bert_common.py @@ -3,17 +3,24 @@ import pytest +from utils_nlp.bert.common import create_data_loader -from utils_nlp.bert.common import Tokenizer, create_data_loader, Language +def test_tokenize(bert_english_tokenizer): + text = ["Hello World.", "How you doing?", "greatttt"] + tokens = bert_english_tokenizer.tokenize(text) + assert len(tokens) == len(text) + assert len(tokens[0]) == 3 + assert len(tokens[1]) == 4 + assert len(tokens[2]) == 3 + assert tokens[2][1].startswith("##") -def test_tokenizer_preprocess_ner_tokens(ner_test_data): - seq_length = 20 - tokenizer = Tokenizer(language=Language.ENGLISHCASED, to_lower=False) +def test_tokenize_ner(ner_test_data, bert_english_tokenizer): + seq_length = 20 # test providing labels - preprocessed_tokens = tokenizer.preprocess_ner_tokens( + preprocessed_tokens = bert_english_tokenizer.tokenize_ner( text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"], @@ 
-28,7 +35,7 @@ def test_tokenizer_preprocess_ner_tokens(ner_test_data): assert preprocessed_tokens[3] == ner_test_data["EXPECTED_LABEL_IDS"] # test not providing labels - preprocessed_tokens = tokenizer.preprocess_ner_tokens( + preprocessed_tokens = bert_english_tokenizer.tokenize_ner( text=ner_test_data["INPUT_TEXT"], label_map=ner_test_data["LABEL_MAP"], max_len=20, diff --git a/tests/unit/test_bert_sequence_classification.py b/tests/unit/test_bert_sequence_classification.py new file mode 100644 index 000000000..b40cb2cc3 --- /dev/null +++ b/tests/unit/test_bert_sequence_classification.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import pytest + +from utils_nlp.bert.sequence_classification import BERTSequenceClassifier +from utils_nlp.bert.common import Language + + +@pytest.fixture() +def data(): + return ( + ["hi", "hello", "what's wrong with us", "can I leave?"], + [0, 0, 1, 2], + ) + + +def test_classifier(bert_english_tokenizer, data): + tokens = bert_english_tokenizer.tokenize(data[0]) + tokens, mask, _ = bert_english_tokenizer.preprocess_classification_tokens( + tokens, max_len=10 + ) + + classifier = BERTSequenceClassifier( + language=Language.ENGLISHCASED, num_labels=3 + ) + classifier.fit( + token_ids=tokens, + input_mask=mask, + labels=data[1], + num_gpus=0, + num_epochs=1, + batch_size=2, + verbose=True, + ) + + preds = classifier.predict( + token_ids=tokens, input_mask=mask, num_gpus=0, batch_size=2 + ) + assert len(preds) == len(data[1]) diff --git a/tests/unit/test_bert_token_classification.py b/tests/unit/test_bert_token_classification.py index 070c37739..ec4c4a961 100644 --- a/tests/unit/test_bert_token_classification.py +++ b/tests/unit/test_bert_token_classification.py @@ -44,6 +44,15 @@ def test_token_classifier_fit_predict(tmp_path, ner_test_data): labels=ner_test_data["INPUT_LABEL_IDS"], ) + # test output probabilities + predictions = token_classifier.predict( + 
token_ids=ner_test_data["INPUT_TOKEN_IDS"], + input_mask=ner_test_data["INPUT_MASK"], + labels=ner_test_data["INPUT_LABEL_IDS"], + probabilities=True, + ) + assert len(predictions.classes) == predictions.probabilities.shape[0] + def test_postprocess_token_labels(ner_test_data): labels_no_padding = postprocess_token_labels( diff --git a/tests/unit/test_data_loaders.py b/tests/unit/test_data_loaders.py new file mode 100644 index 000000000..0a535c3e7 --- /dev/null +++ b/tests/unit/test_data_loaders.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import random + +import numpy as np +import pytest + +from utils_nlp.dataset.data_loaders import DaskCSVLoader + +UNIF1 = {"a": 0, "b": 10, "n": 1000} # some uniform distribution + + +@pytest.fixture() +def csv_file(tmpdir): + f = tmpdir.mkdir("test_loaders").join("tl_data.csv") + for i in range(1000): + f.write( + "\n".join( + [ + "{},{}".format( + random.randint(0, 1), + random.randint(UNIF1["a"], UNIF1["b"]), + ) + for x in range(UNIF1["n"]) + ] + ) + ) + return str(f) + + +def test_dask_csv_rnd_loader(csv_file): + num_batches = 500 + batch_size = 12 + num_partitions = 4 + + loader = DaskCSVLoader( + csv_file, header=None, block_size=5 * int(UNIF1["n"] / num_partitions) + ) + + sample = [] + for batch in loader.get_random_batches(num_batches, batch_size): + sample.append(list(batch.iloc[:, 1])) + sample = np.concatenate(sample) + + assert loader.df.npartitions == num_partitions + assert sample.mean().round() == UNIF1["a"] + UNIF1["b"] / 2 + assert len(sample) <= num_batches * batch_size + + +def test_dask_csv_seq_loader(csv_file): + batch_size = 12 + num_partitions = 4 + + loader = DaskCSVLoader( + csv_file, header=None, block_size=5 * int(UNIF1["n"] / num_partitions) + ) + + sample = [] + for batch in loader.get_sequential_batches(batch_size): + sample.append(list(batch.iloc[:, 1])) + sample = np.concatenate(sample) + + assert loader.df.npartitions == 
num_partitions + assert sample.mean().round() == UNIF1["a"] + UNIF1["b"] / 2 + assert len(sample) == UNIF1["n"] diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 2592fbbd3..91ca57a0b 100755 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -30,27 +30,19 @@ def test_load_pandas_df_msrpc(): def test_wikigold(tmp_path): - wg_text_length = 318333 wg_sentence_count = 1841 wg_test_percentage = 0.5 wg_test_sentence_count = round(wg_sentence_count * wg_test_percentage) wg_train_sentence_count = wg_sentence_count - wg_test_sentence_count - # test download downloaded_file = os.path.join(tmp_path, "wikigold.conll.txt") assert not os.path.exists(downloaded_file) - wg.download(dir_path=tmp_path) - assert os.path.exists(downloaded_file) - - # test read_data - wg_text = wg.read_data(downloaded_file) - assert len(wg_text) == wg_text_length - # test get_train_test_data - train_text, train_labels, test_text, test_labels = wg.get_train_test_data( - wg_text, test_percentage=wg_test_percentage + train_df, test_df = wg.load_train_test_dfs( + tmp_path, test_percentage=wg_test_percentage ) - assert len(train_text) == wg_train_sentence_count - assert len(train_labels) == wg_train_sentence_count - assert len(test_text) == wg_test_sentence_count - assert len(test_labels) == wg_test_sentence_count + + assert os.path.exists(downloaded_file) + + assert train_df.shape == (wg_train_sentence_count, 2) + assert test_df.shape == (wg_test_sentence_count, 2) diff --git a/tests/unit/test_word_embeddings.py b/tests/unit/test_word_embeddings.py index c1eb0e32d..bd1cbb0c2 100644 --- a/tests/unit/test_word_embeddings.py +++ b/tests/unit/test_word_embeddings.py @@ -29,7 +29,7 @@ def test_load_pretrained_vectors_word2vec(): model = load_word2vec(dir_path, limit=500000) assert isinstance(model, Word2VecKeyedVectors) - assert (len(model.wv.vocab) == 500000) + assert len(model.wv.vocab) == 500000 file_path = Path(file_path) assert file_path.is_file() @@ -38,6 +38,7 @@ 
def test_load_pretrained_vectors_word2vec(): assert isinstance(load_word2vec(dir_path), Word2VecKeyedVectors) + def test_load_pretrained_vectors_glove(): dir_path = "temp_data/" file_path = os.path.join( @@ -48,7 +49,7 @@ def test_load_pretrained_vectors_glove(): model = load_glove(dir_path, limit=50000) assert isinstance(model, Word2VecKeyedVectors) - assert (len(model.wv.vocab) == 50000) + assert len(model.wv.vocab) == 50000 file_path = Path(file_path) assert file_path.is_file() @@ -58,7 +59,9 @@ def test_load_pretrained_vectors_glove(): def test_load_pretrained_vectors_fasttext(): dir_path = "temp_data/" - file_path = os.path.join(os.path.join(dir_path, "fastText"), "wiki.simple.bin") + file_path = os.path.join( + os.path.join(dir_path, "fastText"), "wiki.simple.bin" + ) assert isinstance(load_fasttext(dir_path), FastText) diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py index da0a591a4..428790a01 100644 --- a/tools/generate_conda_file.py +++ b/tools/generate_conda_file.py @@ -54,11 +54,10 @@ } PIP_BASE = { - "azureml-sdk[notebooks,tensorboard]": ( - "azureml-sdk[notebooks,tensorboard]==1.0.33" - ), + "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.43", "azureml-dataprep": "azureml-dataprep==1.1.4", "black": "black>=18.6b4", + "dask": "dask[dataframe]==1.2.2", "papermill": "papermill==0.18.2", "pydocumentdb": "pydocumentdb>=2.3.3", "tqdm": "tqdm==4.31.1", @@ -74,6 +73,7 @@ "nltk": "nltk>=3.4", "pytorch-pretrained-bert": "pytorch-pretrained-bert>=0.6", "seqeval": "seqeval>=0.0.12", + "azureml-mlflow": "azureml-mlflow>=1.0.43.1", } PIP_GPU = {"horovod": "horovod>=0.16.1"} diff --git a/tools/repo_metrics/README.md b/tools/repo_metrics/README.md index 389fa9ed8..8d14448c2 100755 --- a/tools/repo_metrics/README.md +++ b/tools/repo_metrics/README.md @@ -1,8 +1,8 @@ # Repository Metrics -[![Build 
status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/Recommenders/Recommenders%20repo%20stats)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=5206) +[![Build Status](https://dev.azure.com/best-practices/nlp/_apis/build/status/repo_metrics?branchName=master)](https://dev.azure.com/best-practices/nlp/_build/latest?definitionId=36&branchName=master) -We developed a script that allows us to track the metrics of the Recommenders repo. Some of the metrics we can track are listed here: +We developed a script that allows us to track the repo metrics. Some of the metrics we can track are listed here: * Number of stars * Number of forks @@ -10,17 +10,27 @@ We developed a script that allows us to track the metrics of the Recommenders re * Number of views * Number of lines of code -To see the full list of metrics, see [git_stats.py](scripts/repo_metrics/git_stats.py) +To see the full list of metrics, see [git_stats.py](git_stats.py) The first step is to set up the credentials, copy the configuration file and fill up the credentials of GitHub and CosmosDB: - cp scripts/repo_metrics/config_template.py scripts/repo_metrics/config.py + cp tools/repo_metrics/config_template.py tools/repo_metrics/config.py To track the current state of the repository and save it to CosmosDB: - python scripts/repo_metrics/track_metrics.py --github_repo "https://github.com/Microsoft/Recommenders" --save_to_database + python tools/repo_metrics/track_metrics.py --github_repo "https://github.com/Microsoft/NLP" --save_to_database To track an event related to this repository and save it to CosmosDB: - python scripts/repo_metrics/track_metrics.py --event "Today we did our first blog of the project" --event_date 2018-12-01 --save_to_database + python tools/repo_metrics/track_metrics.py --event "Today we did our first blog of the project" --event_date 2018-12-01 --save_to_database + + +### Setting up Azure CosmosDB + +The API that we is used 
to track the GitHub metrics is the [Mongo API](https://docs.microsoft.com/en-us/azure/cosmos-db/mongodb-introduction). + +The database name and collections name are defined in the [config file](config_template.py). There are two main collections, defined as `COLLECTION_GITHUB_STATS` and `COLLECTION_EVENTS` to store the information defined on the previous section. + +**IMPORTANT NOTE**: If the database and the collections are created directly through the portal, a common partition key should be defined. We recommend to use `date` as partition key. + diff --git a/tools/repo_metrics/config_template.py b/tools/repo_metrics/config_template.py index 78825d957..dad9a0950 100755 --- a/tools/repo_metrics/config_template.py +++ b/tools/repo_metrics/config_template.py @@ -7,7 +7,7 @@ # CosmosDB Mongo API CONNECTION_STRING = "mongodb://XXXXXXXXXXXXXXXXXXXXXXXXX.documents.azure.com:10255/?ssl=true&replicaSet=globaldb" -DATABASE = "reco_stats" +DATABASE = "nlp_stats" COLLECTION_GITHUB_STATS = "github_stats" COLLECTION_EVENTS = "events" diff --git a/tools/repo_metrics/track_metrics.py b/tools/repo_metrics/track_metrics.py index e4136fdc9..5580ec9f0 100755 --- a/tools/repo_metrics/track_metrics.py +++ b/tools/repo_metrics/track_metrics.py @@ -5,7 +5,6 @@ import os # Need to append a full path instead of relative path. -# This seems to be an issue from Azure DevOps command line task. # NOTE this does not affect running directly in the shell. sys.path.append(os.getcwd()) import argparse @@ -14,9 +13,8 @@ from datetime import datetime from dateutil.parser import isoparse from pymongo import MongoClient -from datetime import datetime -from scripts.repo_metrics.git_stats import Github -from scripts.repo_metrics.config import ( +from tools.repo_metrics.git_stats import Github +from tools.repo_metrics.config import ( GITHUB_TOKEN, CONNECTION_STRING, DATABASE, @@ -32,6 +30,7 @@ def parse_args(): """Argument parser. + Returns: obj: Parser. 
""" @@ -61,8 +60,10 @@ def parse_args(): def connect(uri="mongodb://localhost"): """Mongo connector. + Args: uri (str): Connection string. + Returns: obj: Mongo client. """ @@ -78,9 +79,11 @@ def connect(uri="mongodb://localhost"): def event_as_dict(event, date): """Encodes an string event input as a dictionary with the date. + Args: event (str): Details of a event. date (datetime): Date of the event. + Returns: dict: Dictionary with the event and the date. """ @@ -89,8 +92,10 @@ def event_as_dict(event, date): def github_stats_as_dict(github): """Encodes Github statistics as a dictionary with the date. + Args: obj: Github object. + Returns: dict: Dictionary with Github details and the date. """ @@ -125,6 +130,7 @@ def github_stats_as_dict(github): def tracker(args): """Main function to track metrics. + Args: args (obj): Parsed arguments. """ diff --git a/utils_nlp/bert/common.py b/utils_nlp/bert/common.py index 52dcdb2d5..942b06685 100644 --- a/utils_nlp/bert/common.py +++ b/utils_nlp/bert/common.py @@ -1,10 +1,18 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-from pytorch_pretrained_bert.tokenization import BertTokenizer + +# This script reuses some code from +# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples +# /run_classifier.py + + from enum import Enum import warnings import torch +from tqdm import tqdm + +from pytorch_pretrained_bert.tokenization import BertTokenizer from torch.utils.data import ( DataLoader, @@ -18,12 +26,14 @@ class Language(Enum): - """An enumeration of the supported languages.""" + """An enumeration of the supported pretrained models and languages.""" ENGLISH = "bert-base-uncased" ENGLISHCASED = "bert-base-cased" ENGLISHLARGE = "bert-large-uncased" ENGLISHLARGECASED = "bert-large-cased" + ENGLISHLARGEWWM = "bert-large-uncased-whole-word-masking" + ENGLISHLARGECASEDWWM = "bert-large-cased-whole-word-masking" CHINESE = "bert-base-chinese" MULTILINGUAL = "bert-base-multilingual-cased" @@ -33,6 +43,7 @@ def __init__( self, language=Language.ENGLISH, to_lower=False, cache_dir="." ): """Initializes the underlying pretrained BERT tokenizer. + Args: language (Language, optional): The pretrained model's language. Defaults to Language.ENGLISH. @@ -46,28 +57,62 @@ def __init__( def tokenize(self, text): """Tokenizes a list of documents using a BERT tokenizer + Args: - text (list(str)): list of text documents. + text (list): List of strings (one sequence) or + tuples (two sequences). + Returns: - [list(str)]: list of token lists. + [list]: List of lists. Each sublist contains WordPiece tokens + of the input sequence(s). 
""" - tokens = [self.tokenizer.tokenize(x) for x in text] - return tokens + if isinstance(text[0], str): + return [self.tokenizer.tokenize(x) for x in tqdm(text)] + else: + return [ + [self.tokenizer.tokenize(x) for x in sentences] + for sentences in tqdm(text) + ] + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + # This is a simple heuristic which will always truncate the longer + # sequence one token at a time. This makes more sense than + # truncating an equal percent of tokens from each, since if one + # sequence is very short then each token that's truncated likely + # contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + tokens_a.append("[SEP]") + tokens_b.append("[SEP]") + + return [tokens_a, tokens_b] def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN): """Preprocessing of input tokens: - add BERT sentence markers ([CLS] and [SEP]) - - map tokens to indices + - map tokens to token indices in the BERT vocabulary - pad and truncate sequences - create an input_mask + - create token type ids, aka. segment ids + Args: - tokens (list): List of tokens to preprocess. + tokens (list): List of token lists to preprocess. max_len (int, optional): Maximum number of tokens (documents will be truncated or padded). Defaults to 512. 
Returns: - list of preprocesssed token lists - list of input mask lists + tuple: A tuple containing the following three lists + list of preprocesssed token lists + list of input mask lists + list of token type id lists """ if max_len > BERT_MAX_LEN: print( @@ -77,17 +122,47 @@ def preprocess_classification_tokens(self, tokens, max_len=BERT_MAX_LEN): ) max_len = BERT_MAX_LEN - # truncate and add BERT sentence markers - tokens = [["[CLS]"] + x[0 : max_len - 2] + ["[SEP]"] for x in tokens] + if isinstance(tokens[0][0], str): + tokens = [x[0 : max_len - 2] + ["[SEP]"] for x in tokens] + token_type_ids = None + else: + # get tokens for each sentence [[t00, t01, ...] [t10, t11,... ]] + tokens = [ + self._truncate_seq_pair(sentence[0], sentence[1], max_len - 3) + for sentence in tokens + ] + + # construct token_type_ids + # [[0, 0, 0, 0, ... 0, 1, 1, 1, ... 1], [0, 0, 0, ..., 1, 1, ] + token_type_ids = [ + [[i] * len(sentence) for i, sentence in enumerate(example)] + for example in tokens + ] + # merge sentences + tokens = [ + [token for sentence in example for token in sentence] + for example in tokens + ] + # prefix with [0] for [CLS] + token_type_ids = [ + [0] + [i for sentence in example for i in sentence] + for example in token_type_ids + ] + # pad sequence + token_type_ids = [ + x + [0] * (max_len - len(x)) for x in token_type_ids + ] + + tokens = [["[CLS]"] + x for x in tokens] # convert tokens to indices tokens = [self.tokenizer.convert_tokens_to_ids(x) for x in tokens] # pad sequence tokens = [x + [0] * (max_len - len(x)) for x in tokens] # create input mask input_mask = [[min(1, x) for x in y] for y in tokens] - return tokens, input_mask + return tokens, input_mask, token_type_ids - def preprocess_ner_tokens( + def tokenize_ner( self, text, max_len=BERT_MAX_LEN, @@ -96,7 +171,7 @@ def preprocess_ner_tokens( trailing_piece_tag="X", ): """ - Preprocesses input text, involving the following steps + Tokenize and preprocesses input text, involving the following steps 
0. Tokenize input text. 1. Convert string tokens to token ids. 2. Convert input labels to label ids, if labels and label_map are @@ -118,8 +193,8 @@ def preprocess_ner_tokens( labels (which may be string type) to integers. Default value is None. trailing_piece_tag (str, optional): Tag used to label trailing - word pieces. For example, "playing" is broken into "play" - and "##ing", "play" preserves its original label and "##ing" + word pieces. For example, "criticize" is broken into "critic" + and "##ize", "critic" preserves its original label and "##ize" is labeled as trailing_piece_tag. Default value is "X". Returns: @@ -134,7 +209,7 @@ def preprocess_ner_tokens( 3. trailing_token_mask: List of lists. Each sublist is a boolean list, True for the first word piece of each original word, False for the trailing word pieces, - e.g. "##ing". This mask is useful for removing the + e.g. "##ize". This mask is useful for removing the predictions on trailing word pieces, so that each original word in the input text has a unique predicted label. @@ -142,6 +217,10 @@ def preprocess_ner_tokens( each sublist contains token labels of a input sentence/paragraph, if labels is provided. 
""" + text = [ + self.tokenizer.basic_tokenizer._tokenize_chinese_chars(t) + for t in text + ] if max_len > BERT_MAX_LEN: warnings.warn( "setting max_len to max allowed tokens: {}".format( @@ -162,7 +241,7 @@ def preprocess_ner_tokens( trailing_token_mask_all = [] for t, t_labels in zip(text, labels): new_labels = [] - tokens = [] + new_tokens = [] if label_available: for word, tag in zip(t.split(), t_labels): sub_words = self.tokenizer.tokenize(word) @@ -170,7 +249,7 @@ def preprocess_ner_tokens( if count > 0: tag = trailing_piece_tag new_labels.append(tag) - tokens.append(sub_word) + new_tokens.append(sub_word) else: for word in t.split(): sub_words = self.tokenizer.tokenize(word) @@ -180,12 +259,12 @@ def preprocess_ner_tokens( else: tag = "O" new_labels.append(tag) - tokens.append(sub_word) + new_tokens.append(sub_word) - if len(tokens) > max_len: - tokens = tokens[:max_len] + if len(new_tokens) > max_len: + new_tokens = new_tokens[:max_len] new_labels = new_labels[:max_len] - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens) # The mask has 1 for real tokens and 0 for padding tokens. # Only real tokens are attended to. @@ -235,6 +314,7 @@ def create_data_loader( ): """ Create a dataloader for sampling and serving data batches. + Args: input_ids (list): List of lists. Each sublist contains numerical values, i.e. 
token ids, corresponding to the tokens in the input diff --git a/utils_nlp/bert/sequence_classification.py b/utils_nlp/bert/sequence_classification.py index c5d4614f2..40e77b44b 100644 --- a/utils_nlp/bert/sequence_classification.py +++ b/utils_nlp/bert/sequence_classification.py @@ -5,6 +5,7 @@ # https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py import random +from collections import namedtuple import numpy as np import torch @@ -22,6 +23,7 @@ class BERTSequenceClassifier: def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."): """Initializes the classifier and the underlying pretrained model. + Args: language (Language, optional): The pretrained model's language. Defaults to Language.ENGLISH. @@ -47,19 +49,24 @@ def fit( token_ids, input_mask, labels, + token_type_ids=None, num_gpus=None, num_epochs=1, batch_size=32, lr=2e-5, + warmup_proportion=None, verbose=True, ): """Fine-tunes the BERT classifier using the given training data. + Args: token_ids (list): List of training token id lists. input_mask (list): List of input mask lists. labels (list): List of training labels. - device (str, optional): Device used for training ("cpu" or "gpu"). - Defaults to "gpu". + token_type_ids (list, optional): List of lists. Each sublist + contains segment ids indicating if the token belongs to + the first sentence(0) or second sentence(1). Only needed + for two-sentence tasks. num_gpus (int, optional): The number of gpus to use. If None is specified, all available GPUs will be used. Defaults to None. @@ -67,6 +74,9 @@ def fit( Defaults to 1. batch_size (int, optional): Training batch size. Defaults to 32. lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5. + warmup_proportion (float, optional): Proportion of training to + perform linear learning rate warmup for. E.g., 0.1 = 10% of + training. Defaults to None. verbose (bool, optional): If True, shows the training progress and loss values. 
Defaults to True. """ @@ -95,16 +105,27 @@ def fit( }, ] - opt = BertAdam(optimizer_grouped_parameters, lr=lr) + num_examples = len(token_ids) + num_batches = int(num_examples / batch_size) + num_train_optimization_steps = num_batches * num_epochs + + if warmup_proportion is None: + opt = BertAdam(optimizer_grouped_parameters, lr=lr) + else: + opt = BertAdam( + optimizer_grouped_parameters, + lr=lr, + t_total=num_train_optimization_steps, + warmup=warmup_proportion, + ) # define loss function loss_func = nn.CrossEntropyLoss().to(device) # train self.model.train() # training mode - num_examples = len(token_ids) - num_batches = int(num_examples / batch_size) + token_type_ids_batch = None for epoch in range(num_epochs): for i in range(num_batches): @@ -121,11 +142,18 @@ def fit( input_mask[start:end], dtype=torch.long, device=device ) + if token_type_ids is not None: + token_type_ids_batch = torch.tensor( + token_type_ids[start:end], + dtype=torch.long, + device=device, + ) + opt.zero_grad() y_h = self.model( input_ids=x_batch, - token_type_ids=None, + token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None, ) @@ -146,20 +174,37 @@ def fit( ) ) # empty cache - del [x_batch, y_batch, mask_batch] + del [x_batch, y_batch, mask_batch, token_type_ids_batch] torch.cuda.empty_cache() - def predict(self, token_ids, input_mask, num_gpus=None, batch_size=32): + def predict( + self, + token_ids, + input_mask, + token_type_ids=None, + num_gpus=None, + batch_size=32, + probabilities=False, + ): """Scores the given dataset and returns the predicted classes. + Args: token_ids (list): List of training token lists. input_mask (list): List of input mask lists. + token_type_ids (list, optional): List of lists. Each sublist + contains segment ids indicating if the token belongs to + the first sentence(0) or second sentence(1). Only needed + for two-sentence tasks. num_gpus (int, optional): The number of gpus to use. 
If None is specified, all available GPUs will be used. Defaults to None. batch_size (int, optional): Scoring batch size. Defaults to 32. + probabilities (bool, optional): + If True, the predicted probability distribution + is also returned. Defaults to False. Returns: - [ndarray]: Predicted classes. + 1darray, namedtuple(1darray, ndarray): Predicted classes or + (classes, probabilities) if probabilities is True. """ device = get_device("cpu" if num_gpus == 0 else "gpu") @@ -178,16 +223,30 @@ def predict(self, token_ids, input_mask, num_gpus=None, batch_size=32): mask_batch = torch.tensor( mask_batch, dtype=torch.long, device=device ) + token_type_ids_batch = None + if token_type_ids is not None: + token_type_ids_batch = torch.tensor( + token_type_ids[i : i + batch_size], + dtype=torch.long, + device=device, + ) with torch.no_grad(): p_batch = self.model( input_ids=x_batch, - token_type_ids=None, + token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None, ) - preds.append(p_batch.cpu().data.numpy()) + preds.append(p_batch.cpu()) if i % batch_size == 0: pbar.update(batch_size) - preds = [x.argmax(1) for x in preds] + preds = np.concatenate(preds) - return preds + + if probabilities: + return namedtuple("Predictions", "classes probabilities")( + preds.argmax(axis=1), + nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(), + ) + else: + return preds.argmax(axis=1) diff --git a/utils_nlp/bert/token_classification.py b/utils_nlp/bert/token_classification.py index 3182684b4..f4c0f50bf 100644 --- a/utils_nlp/bert/token_classification.py +++ b/utils_nlp/bert/token_classification.py @@ -7,6 +7,7 @@ import numpy as np from tqdm import tqdm, trange +from collections import namedtuple import torch import torch.nn as nn @@ -14,7 +15,7 @@ from pytorch_pretrained_bert.optimization import BertAdam from pytorch_pretrained_bert.modeling import BertForTokenClassification -from .common import Language, create_data_loader +from utils_nlp.bert.common import Language, 
create_data_loader from utils_nlp.pytorch.device_utils import get_device, move_to_device @@ -192,8 +193,16 @@ def fit( train_loss = tr_loss / nb_tr_steps print("Train loss: {}".format(train_loss)) + torch.cuda.empty_cache() + def predict( - self, token_ids, input_mask, labels=None, batch_size=32, num_gpus=None + self, + token_ids, + input_mask, + labels=None, + batch_size=32, + num_gpus=None, + probabilities=False, ): """ Predict token labels on the testing data. @@ -215,7 +224,12 @@ def predict( If None, all available GPUs will be used. Defaults to None. Returns: - list: List of lists of predicted token labels. + list or namedtuple(list, ndarray): List of lists of predicted + token labels or ([token labels], probabilities) if + probabilities is True. The probabilities output is an n x m + array, where n is the size of the testing data and m is the + number of tokens in each input sublist. The probability + values are the softmax probability of the predicted class. """ test_dataloader = create_data_loader( input_ids=token_ids, @@ -228,7 +242,6 @@ def predict( self.model = move_to_device(self.model, device, num_gpus) self.model.eval() - predictions = [] eval_loss = 0 nb_eval_steps = 0 for step, batch in enumerate( @@ -255,16 +268,37 @@ def predict( eval_loss += tmp_eval_loss.mean().item() - logits = logits.detach().cpu().numpy() - predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) + logits = logits.detach().cpu() + + if step == 0: + logits_all = logits.numpy() + else: + logits_all = np.append(logits_all, logits, axis=0) nb_eval_steps += 1 + predictions = [list(p) for p in np.argmax(logits_all, axis=2)] + if true_label_available: validation_loss = eval_loss / nb_eval_steps print("Evaluation loss: {}".format(validation_loss)) - return predictions + if probabilities: + return namedtuple("Predictions", "classes probabilities")( + predictions, + np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2), + ) + else: + return predictions + + +def 
create_label_map(label_list, trailing_piece_tag="X"): + label_map = {label: i for i, label in enumerate(label_list)} + + if trailing_piece_tag not in label_list: + label_map[trailing_piece_tag] = len(label_list) + + return label_map def postprocess_token_labels( @@ -294,13 +328,13 @@ def postprocess_token_labels( original labels. Default value is None. remove_trailing_word_pieces (bool, optional): Whether to remove predicted labels of trailing word pieces generated by WordPiece - tokenizer. For example, "playing" is broken into "play" and - "##ing". After removing predicted label for "##ing", + tokenizer. For example, "criticize" is broken into "critic" and + "##ize". After removing predicted label for "##ize", the predicted label for "play" is assigned to the original word "playing". Default value is False. trailing_token_mask (list, optional): list of boolean values, True for the first word piece of each original word, False for trailing - word pieces, e.g. ##ing. If remove_trailing_word_pieces is + word pieces, e.g. ##ize. If remove_trailing_word_pieces is True, this mask is used to remove the predicted labels on trailing word pieces, so that each original word in the input text has a unique predicted label. diff --git a/utils_nlp/dataset/data_loaders.py b/utils_nlp/dataset/data_loaders.py new file mode 100644 index 000000000..08111fd95 --- /dev/null +++ b/utils_nlp/dataset/data_loaders.py @@ -0,0 +1,73 @@ +import random +import dask.dataframe as dd + + +class DaskCSVLoader: + """Class for creating and using a loader for large csv + or other delimited files. The loader uses dask to read + smaller partitions of a file into memory (one partition at a time), + before sampling batches from the partitions. + """ + + def __init__( + self, + file_path, + sep=",", + header="infer", + block_size=10e6, + random_seed=None, + ): + """Initializes the loader. + + Args: + file_path (str): Path to delimited file. + sep (str, optional): Delimiter. Defaults to ",". 
+ header (str, optional): Number of rows to be used as the header. + See pandas.read_csv() + Defaults to "infer". + block_size (int, optional): Size of partition in bytes. + See dask.dataframe.read_csv() + Defaults to 10e6. + random_seed (int, optional): Random seed. See random.seed(). + Defaults to None. + """ + + self.df = dd.read_csv( + file_path, sep=sep, header=header, blocksize=block_size + ) + self.random_seed = random_seed + random.seed(random_seed) + + def get_random_batches(self, num_batches, batch_size): + """Creates a random-batch generator. + Batches returned are pandas dataframes of length=batch_size. + Note: If the sampled partition has less rows than the + specified batch_size, then a smaller batch of the same + size as that partition's number of rows is returned. + + Args: + num_batches (int): Number of batches to generate. + batch_size (int]): Batch size. + """ + for i in range(num_batches): + rnd_part_idx = random.randint(0, self.df.npartitions - 1) + sample_part = self.df.partitions[rnd_part_idx].compute() + if sample_part.shape[0] > batch_size: + yield sample_part.sample( + batch_size, random_state=self.random_seed + ) + else: + yield sample_part + + def get_sequential_batches(self, batch_size): + """Creates a sequential generator. + Batches returned are pandas dataframes of length=batch_size. + Note: Final batch might be of smaller size. + + Args: + batch_size (int): Batch size. 
+ """ + for i in range(self.df.npartitions): + part = self.df.partitions[i].compute() + for j in range(0, part.shape[0], batch_size): + yield part.iloc[j : j + batch_size, :] diff --git a/utils_nlp/dataset/msra_ner.py b/utils_nlp/dataset/msra_ner.py new file mode 100644 index 000000000..b4a15b815 --- /dev/null +++ b/utils_nlp/dataset/msra_ner.py @@ -0,0 +1,38 @@ +import os +import pandas as pd +from utils_nlp.dataset.ner_utils import preprocess_conll + + +FILES = { + "train": "MSRA/msra-bakeoff3-training-utf8.2col", + "test": "MSRA/bakeoff3_goldstandard.txt", +} +ENCODINGS = {"train": "utf8", "test": "gbk"} + + +def load_pandas_df(local_cache_path="./", file_split="test"): + file_path = os.path.join(local_cache_path, FILES[file_split]) + encoding = ENCODINGS[file_split] + + with open(file_path, encoding=encoding) as file_path: + text = file_path.read() + + # Add line break after punctuations indicating end of sentence in Chinese + text = text.replace("。 0", "。 0\n") + text = text.replace("? 0", "? 0\n") + text = text.replace("! 0", "! 0\n") + + sentence_list, labels_list = preprocess_conll(text, file_split) + + labels_list = [ + ["O" if label == "0" else label for label in labels] + for labels in labels_list + ] + + df = pd.DataFrame({"sentence": sentence_list, "labels": labels_list}) + + return df + + +def get_unique_labels(): + return ["O", "B-LOC", "B-ORG", "B-PER", "I-LOC", "I-ORG", "I-PER"] diff --git a/utils_nlp/dataset/ner_utils.py b/utils_nlp/dataset/ner_utils.py new file mode 100644 index 000000000..7ac60734e --- /dev/null +++ b/utils_nlp/dataset/ner_utils.py @@ -0,0 +1,46 @@ +def preprocess_conll(text, data_type=""): + """ + Helper function converting data in conll format to sentence and list + of token labels. + + Args: + text (str): Text string in conll format, e.g. + "Amy B-PER + ADAMS I-PER + works O + at O + the O + University B-ORG + of I-ORG + Minnesota I-ORG + . O" + data_type (str, optional): String that briefly describes the data, + e.g. 
"train" + Returns: + tuple: + (list of sentences, list of token label lists) + """ + text_list = text.split("\n\n") + if text_list[-1] in (" ", ""): + text_list = text_list[:-1] + + max_seq_len = 0 + sentence_list = [] + labels_list = [] + for s in text_list: + # split each sentence string into "word label" pairs + s_split = s.split("\n") + # split "word label" pairs + s_split_split = [t.split() for t in s_split] + sentence_list.append( + " ".join([t[0] for t in s_split_split if len(t) > 1]) + ) + labels_list.append([t[1] for t in s_split_split if len(t) > 1]) + if len(s_split_split) > max_seq_len: + max_seq_len = len(s_split_split) + print( + "Maximum sequence length in {0} data is: {1}".format( + data_type, max_seq_len + ) + ) + return sentence_list, labels_list diff --git a/utils_nlp/dataset/preprocess.py b/utils_nlp/dataset/preprocess.py index 03f0e9062..2e51821f5 100644 --- a/utils_nlp/dataset/preprocess.py +++ b/utils_nlp/dataset/preprocess.py @@ -22,7 +22,8 @@ def to_lowercase_all(df): def to_lowercase(df, column_names=[]): """ - This function transforms strings of the column names in the dataframe passed to lowercase + This function transforms strings of the column names in the dataframe + passed to lowercase Args: df (pd.DataFrame): Raw dataframe with some text columns. @@ -46,18 +47,18 @@ def to_spacy_tokens( token_cols=["sentence1_tokens", "sentence2_tokens"], ): """ - This function tokenizes the sentence pairs using spaCy, defaulting to the - spaCy en_core_web_sm model - - Args: - df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize. - sentence_cols (list, optional): Column names of the raw sentence pairs. - token_cols (list, optional): Column names for the tokenized sentences. - - Returns: - pd.DataFrame: Dataframe with new columns token_cols, each containing - a list of tokens for their respective sentences. 
- """ + This function tokenizes the sentence pairs using spaCy, defaulting to the + spaCy en_core_web_sm model + + Args: + df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize. + sentence_cols (list, optional): Column names of the raw sentence pairs. + token_cols (list, optional): Column names for the tokenized sentences. + + Returns: + pd.DataFrame: Dataframe with new columns token_cols, each containing + a list of tokens for their respective sentences. + """ nlp = spacy.load("en_core_web_sm") text_df = df[sentence_cols] nlp_df = text_df.applymap(lambda x: nlp(x)) @@ -77,21 +78,22 @@ def rm_spacy_stopwords( custom_stopwords=[], ): """ - This function tokenizes the sentence pairs using spaCy and remove stopwords, - defaulting to the spaCy en_core_web_sm model - - Args: - df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize. - sentence_cols (list, optional): Column names for the raw sentence pairs. - stop_cols (list, optional): Column names for the tokenized sentences - without stop words. - custom_stopwords (list of str, optional): List of custom stopwords to - register with the spaCy model. - - Returns: - pd.DataFrame: Dataframe with new columns stop_cols, each containing a - list of tokens for their respective sentences. - """ + This function tokenizes the sentence pairs using spaCy and remove + stopwords, defaulting to the spaCy en_core_web_sm model + + Args: + df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize. + sentence_cols (list, optional): Column names for the raw sentence + pairs. + stop_cols (list, optional): Column names for the tokenized sentences + without stop words. + custom_stopwords (list of str, optional): List of custom stopwords to + register with the spaCy model. + + Returns: + pd.DataFrame: Dataframe with new columns stop_cols, each containing a + list of tokens for their respective sentences. 
+ """ nlp = spacy.load("en_core_web_sm") if len(custom_stopwords) > 0: for csw in custom_stopwords: @@ -160,3 +162,13 @@ def rm_nltk_stopwords( stop_df.columns = stop_cols return pd.concat([df, stop_df], axis=1) + + +def convert_to_unicode(input_text): + """Converts intput_text to Unicode. Input must be utf-8.""" + if isinstance(input_text, str): + return input_text + elif isinstance(input_text, bytes): + return input_text.decode("utf-8", "ignore") + else: + raise TypeError("Unsupported string type: %s" % (type(input_text))) diff --git a/utils_nlp/dataset/stsbenchmark.py b/utils_nlp/dataset/stsbenchmark.py index ed919ed57..31e05e637 100644 --- a/utils_nlp/dataset/stsbenchmark.py +++ b/utils_nlp/dataset/stsbenchmark.py @@ -4,7 +4,6 @@ import os import tarfile import pandas as pd -import azureml.dataprep as dp from utils_nlp.dataset.url_utils import maybe_download @@ -14,38 +13,33 @@ def load_pandas_df(data_path, file_split=DEFAULT_FILE_SPLIT): """Load the STS Benchmark dataset as a pd.DataFrame - + Args: data_path (str): Path to data directory - file_split (str, optional): File split to load. One of (train, dev, test). Defaults to train. - + file_split (str, optional): File split to load. + One of (train, dev, test). + Defaults to train. 
+ Returns: pd.DataFrame: STS Benchmark dataset """ - clean_file_path = os.path.join( - data_path, "clean/stsbenchmark", "sts-{}.csv".format(file_split) - ) - dflow = _maybe_download_and_extract(data_path, clean_file_path) - return dflow.to_pandas_dataframe() - - -def _maybe_download_and_extract(base_data_path, clean_file_path): - if not os.path.exists(clean_file_path): - raw_data_path = os.path.join(base_data_path, "raw") - if not os.path.exists(raw_data_path): - os.makedirs(raw_data_path) - sts_path = _download_sts(raw_data_path) - sts_files = [f for f in os.listdir(sts_path) if f.endswith(".csv")] - _clean_sts( - sts_files, - sts_path, - os.path.join(base_data_path, "clean", "stsbenchmark"), - ) - return dp.auto_read_file(clean_file_path).drop_columns("Column1") + file_name = "sts-{}.csv".format(file_split) + df = _maybe_download_and_extract(file_name, data_path) + return df + + +def _maybe_download_and_extract(sts_file, base_data_path): + raw_data_path = os.path.join(base_data_path, "raw") + if not os.path.exists(raw_data_path): + os.makedirs(raw_data_path) + sts_path = _download_sts(raw_data_path) + df = _load_sts(os.path.join(sts_path, sts_file)) + return df def _download_sts(dirpath): - """Download and extract data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz + """Download and extract data from + http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz Args: dirpath (str): Path to data directory. @@ -66,8 +60,11 @@ def _extract_sts(tarpath, target_dirpath=".", tmode="r"): Args: tarpath (str): Path to tarfile, to be deleted after extraction. - target_dirpath (str, optional): Directory in which to save the extracted files. - tmode (str, optional): The mode for reading, of the form "filemode[:compression]". Defaults to "r". + target_dirpath (str, optional): Directory in which to save + the extracted files. + tmode (str, optional): The mode for reading, + of the form "filemode[:compression]". + Defaults to "r". 
Returns: str: Path to extracted STS Benchmark data. @@ -79,31 +76,59 @@ def _extract_sts(tarpath, target_dirpath=".", tmode="r"): return os.path.join(target_dirpath, extracted) -def _clean_sts(filenames, src_dir, target_dir): - """Drop columns containing irrelevant metadata and save as new csv files in the target_dir +def _load_sts(src_file_path): + """Load datafile as dataframe Args: - filenames (list of str): List of filenames for the train/dev/test csv files. - src_dir (str): Directory for the raw csv files. - target_dir (str): Directory for the clean csv files to be written to. + src_file_path (str): filepath to train/dev/test csv files. """ - if not os.path.exists(target_dir): - os.makedirs(target_dir) - filepaths = [os.path.join(src_dir, f) for f in filenames] - for i, fp in enumerate(filepaths): - dat = dp.auto_read_file(path=fp) - s = dat.keep_columns(["Column5", "Column6", "Column7"]).rename_columns( - { - "Column5": "score", - "Column6": "sentence1", - "Column7": "sentence2", - } - ) - print( - "Writing clean dataframe to {}".format( - os.path.join(target_dir, filenames[i]) + with open(src_file_path, "r", encoding="utf-8") as f: + sent_pairs = [] + for line in f: + line = line.strip().split("\t") + sent_pairs.append( + [ + line[0].strip(), + line[1].strip(), + line[2].strip(), + line[3].strip(), + float(line[4]), + line[5].strip(), + line[6].strip(), + ] ) + + sdf = pd.DataFrame( + sent_pairs, + columns=[ + "column_0", + "column_1", + "column_2", + "column_3", + "column_4", + "column_5", + "column_6", + ], ) - sdf = s.to_pandas_dataframe().to_csv( - os.path.join(target_dir, filenames[i]), sep="\t" - ) + return sdf + + +def clean_sts(df): + """Drop columns containing irrelevant metadata and + save as new csv files in the target_dir. + + Args: + df (pandas.Dataframe): drop columns from train/test/dev files. 
+ """ + clean_df = df.drop( + ["column_0", "column_1", "column_2", "column_3"], axis=1 + ) + clean_df = clean_df.rename( + index=str, + columns={ + "column_4": "score", + "column_5": "sentence1", + "column_6": "sentence2", + }, + ) + return clean_df diff --git a/utils_nlp/dataset/wikigold.py b/utils_nlp/dataset/wikigold.py index 8f32bad27..740440831 100644 --- a/utils_nlp/dataset/wikigold.py +++ b/utils_nlp/dataset/wikigold.py @@ -2,6 +2,9 @@ # Licensed under the MIT License. import random +import os +import pandas as pd + from utils_nlp.dataset.url_utils import maybe_download URL = ( @@ -10,48 +13,35 @@ ) -def download(dir_path="."): - """Download the wikigold data file to dir_path if it doesn't exist yet.""" - file_name = URL.split("/")[-1] - maybe_download(URL, file_name, dir_path) - - -def read_data(data_file): +def load_train_test_dfs( + local_cache_path="./", test_percentage=0.5, random_seed=None +): """ - Read the wikigold dataset into a string of text. + Get the training and testing data frames based on test_percentage. Args: - data_file (str): data file path, including the file name. - - Returns: - str: One string containing the wikigold dataset. - """ - with open(data_file, "r", encoding="utf8") as file: - text = file.read() - - return text - - -def get_train_test_data(text, test_percentage=0.5, random_seed=None): - """ - Get the training and testing data based on test_percentage. - - Args: - text (str): One string containing the wikigold dataset. + local_cache_path (str): Path to store the data. If the data file + doesn't exist in this path, it's downloaded. test_percentage (float, optional): Percentage of data ot use for testing. Since this is a small dataset, the default testing percentage is set to 0.5 random_seed (float, optional): Random seed used to shuffle the data. Returns: - tuple: A tuple containing four lists: - train_sentence_list: List of training sentence strings. - train_labels_list: List of lists. 
Each sublist contains the - entity labels of the words in the training sentence. - test_sentence_list: List of testing sentence strings. - test_labels_list: List of lists. Each sublist contains the - entity labels of the word in the testing sentence. + tuple: (train_pandas_df, test_pandas_df), each data frame contains + two columns + "sentence": sentences in strings. + "labels": list of entity labels of the words in the sentence. + """ + file_name = URL.split("/")[-1] + maybe_download(URL, file_name, local_cache_path) + + data_file = os.path.join(local_cache_path, file_name) + + with open(data_file, "r", encoding="utf8") as file: + text = file.read() + # Input data are separated by empty lines text_split = text.split("\n\n") # Remove empty line at EOF @@ -94,14 +84,17 @@ def _get_sentence_and_labels(text_list, data_type): test_text_split, "testing" ) - return ( - train_sentence_list, - train_labels_list, - test_sentence_list, - test_labels_list, + train_df = pd.DataFrame( + {"sentence": train_sentence_list, "labels": train_labels_list} ) + test_df = pd.DataFrame( + {"sentence": test_sentence_list, "labels": test_labels_list} + ) + + return (train_df, test_df) + def get_unique_labels(): """Get the unique labels in the wikigold dataset.""" - return ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG", "X"] + return ["O", "I-LOC", "I-MISC", "I-PER", "I-ORG"] diff --git a/utils_nlp/dataset/xnli.py b/utils_nlp/dataset/xnli.py index e7bbcf4cb..a233c9e7b 100644 --- a/utils_nlp/dataset/xnli.py +++ b/utils_nlp/dataset/xnli.py @@ -10,37 +10,86 @@ import pandas as pd from utils_nlp.dataset.url_utils import extract_zip, maybe_download +from utils_nlp.dataset.preprocess import convert_to_unicode -URL = "https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip" +URL_XNLI = "https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip" +URL_XNLI_MT = "https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip" -DATA_FILES = { - "dev": "XNLI-1.0/xnli.dev.jsonl", - "test": "XNLI-1.0/xnli.test.jsonl", -} 
+def load_pandas_df(local_cache_path="./", file_split="dev", language="zh"): + """Downloads and extracts the dataset files. -def load_pandas_df(local_cache_path=None, file_split="dev"): - """Downloads and extracts the dataset files Args: - local_cache_path ([type], optional): [description]. - Defaults to None. + local_cache_path (str, optional): Path to store the data. + Defaults to "./". file_split (str, optional): The subset to load. - One of: {"dev", "test"} - Defaults to "train". + One of: {"train", "dev", "test"} + Defaults to "dev". + language (str, optional): language subset to read. + One of: {"en", "fr", "es", "de", "el", "bg", "ru", + "tr", "ar", "vi", "th", "zh", "hi", "sw", "ur"} + Defaults to "zh" (Chinese). Returns: pd.DataFrame: pandas DataFrame containing the specified XNLI subset. """ - file_name = URL.split("/")[-1] - maybe_download(URL, file_name, local_cache_path) + if file_split in ("dev", "test"): + url = URL_XNLI + sentence_1_index = 6 + sentence_2_index = 7 + label_index = 1 - if not os.path.exists( - os.path.join(local_cache_path, DATA_FILES[file_split]) - ): + zip_file_name = url.split("/")[-1] + folder_name = ".".join(zip_file_name.split(".")[:-1]) + file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"]) + elif file_split == "train": + url = URL_XNLI_MT + sentence_1_index = 0 + sentence_2_index = 1 + label_index = 2 + + zip_file_name = url.split("/")[-1] + folder_name = ".".join(zip_file_name.split(".")[:-1]) + file_name = ( + folder_name + + "/multinli/" + + ".".join(["multinli", file_split, language, "tsv"]) + ) + + maybe_download(url, zip_file_name, local_cache_path) + + if not os.path.exists(os.path.join(local_cache_path, folder_name)): extract_zip( - os.path.join(local_cache_path, file_name), local_cache_path + os.path.join(local_cache_path, zip_file_name), local_cache_path + ) + + with open( + os.path.join(local_cache_path, file_name), "r", encoding="utf-8" + ) as f: + lines = f.read().splitlines() + + line_list = 
[line.split("\t") for line in lines] + # Remove the column name row + line_list.pop(0) + if file_split != "train": + line_list = [line for line in line_list if line[0] == language] + + label_list = [convert_to_unicode(line[label_index]) for line in line_list] + old_contradict_label = convert_to_unicode("contradictory") + new_contradict_label = convert_to_unicode("contradiction") + label_list = [ + new_contradict_label if label == old_contradict_label else label + for label in label_list + ] + text_list = [ + ( + convert_to_unicode(line[sentence_1_index]), + convert_to_unicode(line[sentence_2_index]), ) - return pd.read_json( - os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True - ) + for line in line_list + ] + + df = pd.DataFrame({"text": text_list, "label": label_list}) + + return df diff --git a/utils_nlp/dataset/yahoo_answers.py b/utils_nlp/dataset/yahoo_answers.py deleted file mode 100644 index a4c4a8923..000000000 --- a/utils_nlp/dataset/yahoo_answers.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Yahoo! Answers dataset utils""" - -import os -import pandas as pd -from utils_nlp.dataset.url_utils import maybe_download, extract_tar - - -URL = "https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz" - - -def download(dir_path): - """Downloads and extracts the dataset files""" - file_name = URL.split("/")[-1] - maybe_download(URL, file_name, dir_path) - extract_tar(os.path.join(dir_path, file_name), dir_path) - - -def read_data(data_file, nrows=None): - return pd.read_csv(data_file, header=None, nrows=nrows) - - -def get_text(df): - df.fillna("", inplace=True) - text = df.iloc[:, 1] + " " + df.iloc[:, 2] + " " + df.iloc[:, 3] - text = text.str.replace(r"[^A-Za-z ]", "").str.lower() - text = text.str.replace(r"\\s+", " ") - text = text.astype(str) - return text - - -def get_labels(df): - return list(df[0] - 1)