diff --git a/README.md b/README.md index 1ec683ab..5fff9d39 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,13 @@ `nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). -ML.NET was originally developed in Microsoft Research and is used across many product groups in Microsoft like Windows, Bing, PowerPoint, Excel and others. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance. +ML.NET was originally developed in Microsoft Research and is used across many Microsoft product groups, such as Windows, Bing, PowerPoint, and Excel. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance. -This package enables training ML.NET pipelines or integrating ML.NET components directly into [scikit-learn](https://scikit-learn.org/stable/) pipelines (it supports `numpy.ndarray`, `scipy.sparse_cst`, and `pandas.DataFrame` as inputs). +`nimbusml` enables training ML.NET pipelines or integrating ML.NET components directly into [scikit-learn](https://scikit-learn.org/stable/) pipelines. It adheres to existing `scikit-learn` conventions, allowing simple interoperability between `nimbusml` and `scikit-learn` components, while adding a suite of fast, highly optimized, and scalable algorithms, transforms, and components written in C++ and C\#. + +See the examples below showing interoperability with `scikit-learn`. A more detailed example in the [documentation](https://docs.microsoft.com/en-us/nimbusml/tutorials/b_c-sentiment-analysis-3-combining-nimbusml-and-scikit-learn) shows how to use a `nimbusml` component in a `scikit-learn` pipeline, and how to create a pipeline using only `nimbusml` components. + +`nimbusml` supports `numpy.ndarray`, `scipy.sparse.csr_matrix`, and `pandas.DataFrame` as inputs. In addition, `nimbusml` supports streaming from files without loading the dataset into memory with `FileDataStream`, which allows training on data significantly larger than memory. Documentation can be found [here](https://docs.microsoft.com/en-us/NimbusML/overview) and additional notebook samples can be found [here](https://github.com/Microsoft/NimbusML-Samples). @@ -84,7 +88,7 @@ To build `nimbusml` from source please visit our [developer guide](docs/develope ## Contributing -The contributions guide can be found [here](CONTRIBUTING.md). Given the experimental nature of this project, support will be provided on a best-effort basis. We suggest opening an issue for discussion before starting a PR with big changes. +The contributions guide can be found [here](CONTRIBUTING.md).
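To illustrate the `FileDataStream` streaming claim in the README text above, here is a minimal sketch of fitting a pipeline directly on a file without loading it into memory. The file name and column names (`train.csv`, `c0`, `c1`, `c2`) are hypothetical placeholders; `FileDataStream.read_csv`, `Pipeline`, `OneHotVectorizer`, and `FastLinearRegressor` are existing `nimbusml` APIs.

```python
from nimbusml import Pipeline, FileDataStream
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearRegressor

# Stream the training data from disk instead of materializing it in memory.
# 'train.csv' and the column names below are hypothetical.
data = FileDataStream.read_csv('train.csv')

pipe = Pipeline([
    OneHotVectorizer(columns={'c0': 'c0'}),
    FastLinearRegressor(feature=['c0', 'c1'], label='c2'),
])
pipe.fit(data)
```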
## Support diff --git a/build.cmd b/build.cmd index 8ad4a127..46c56ac5 100644 --- a/build.cmd +++ b/build.cmd @@ -26,6 +26,9 @@ set RunExtendedTests=False set BuildDotNetBridgeOnly=False set SkipDotNetBridge=False set AzureBuild=False +set BuildManifestGenerator=False +set UpdateManifest=False +set VerifyManifest=False :Arg_Loop if [%1] == [] goto :Build @@ -53,6 +56,10 @@ if /i [%1] == [--skipDotNetBridge] ( set SkipDotNetBridge=True shift && goto :Arg_Loop ) +if /i [%1] == [--updateManifest] ( + set UpdateManifest=True + shift && goto :Arg_Loop +) if /i [%1] == [--azureBuild] ( set AzureBuild=True shift && goto :Arg_Loop @@ -68,6 +75,7 @@ echo " --installPythonPackages Install python packages after build" echo " --includeExtendedTests Include the extended tests if the tests are run" echo " --buildDotNetBridgeOnly Build only DotNetBridge" echo " --skipDotNetBridge Build everything except DotNetBridge" +echo " --updateManifest Update manifest.json" echo " --azureBuild Building in azure devops (adds dotnet CLI to the path)" goto :Exit_Success @@ -173,8 +181,6 @@ if "%AzureBuild%" == "True" ( echo ##vso[task.prependpath]%_dotnetRoot% ) -set LOCAL_NUGET_PACKAGES_DIR=.\local-nuget-packages - :: Build managed code echo "" echo "#################################" @@ -191,6 +197,37 @@ if "%BuildDotNetBridgeOnly%" == "True" ( call "%_dotnet%" build -c %Configuration% --force "%__currentScriptDir%src\Platforms\build.csproj" call "%_dotnet%" publish "%__currentScriptDir%src\Platforms\build.csproj" --force --self-contained -r win-x64 -c %Configuration% + +if "%Configuration:~-5%" == "Py3.7" set VerifyManifest=True +if "%VerifyManifest%" == "True" set BuildManifestGenerator=True +if "%UpdateManifest%" == "True" set BuildManifestGenerator=True + +if "%BuildManifestGenerator%" == "True" ( + echo "" + echo "#################################" + echo "Building Manifest Generator... " + echo "#################################" + call "%_dotnet%" build -c %Configuration% -o "%BuildOutputDir%%Configuration%" --force "%__currentScriptDir%src\ManifestGenerator\ManifestGenerator.csproj" +) + +if "%UpdateManifest%" == "True" ( + echo Updating manifest.json ... + call "%_dotnet%" "%BuildOutputDir%%Configuration%\ManifestGenerator.dll" create %__currentScriptDir%\src\python\tools\manifest.json + echo manifest.json updated. + echo Run entrypoint_compiler.py --generate_api --generate_entrypoints to generate entry points and api files. + goto :Exit_Success +) + +if "%VerifyManifest%" == "True" ( + echo Verifying manifest.json ... + call "%_dotnet%" "%BuildOutputDir%%Configuration%\ManifestGenerator.dll" verify %__currentScriptDir%\src\python\tools\manifest.json + if errorlevel 1 ( + echo manifest.json is invalid. + echo Run build --updateManifest to update manifest.json. + goto :Exit_Error + ) +) + echo "" echo "#################################" echo "Downloading Dependencies " @@ -352,13 +389,13 @@ if "%InstallPythonPackages%" == "True" ( echo "#################################" echo "Installing python packages ... 
" echo "#################################" - call "%PythonExe%" -m pip install --upgrade pip - call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0" + call "%PythonExe%" -m pip install --upgrade "pip==19.3.1" + call "%PythonExe%" -m pip install --upgrade nose pytest pytest-xdist graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0" if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq ) else ( - call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.12" + call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33" ) call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" @@ -379,27 +416,53 @@ set TestsPath1=%PackagePath%\tests set TestsPath2=%__currentScriptDir%src\python\tests set TestsPath3=%__currentScriptDir%src\python\tests_extended set ReportPath=%__currentScriptDir%build\TestCoverageReport -call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" -if errorlevel 1 ( - goto :Exit_Error -) -call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" +set NumConcurrentTests=%NUMBER_OF_PROCESSORS% + +call "%PythonExe%" -m pytest -n %NumConcurrentTests% --verbose --maxfail=1000 --capture=sys "%TestsPath2%" "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" if errorlevel 1 ( - goto :Exit_Error + :: Rerun any failed tests to give them one more + :: chance in case the errors were intermittent. + call "%PythonExe%" -m pytest -n %NumConcurrentTests% --last-failed --verbose --maxfail=1000 --capture=sys "%TestsPath2%" "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" + if errorlevel 1 ( + goto :Exit_Error + ) ) if "%RunExtendedTests%" == "True" ( - call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" + call "%PythonExe%" -m pytest -n %NumConcurrentTests% --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" if errorlevel 1 ( - goto :Exit_Error + :: Rerun any failed tests to give them one more + :: chance in case the errors were intermittent. + call "%PythonExe%" -m pytest -n %NumConcurrentTests% --last-failed --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%" + if errorlevel 1 ( + goto :Exit_Error + ) ) ) :Exit_Success +call :CleanUpDotnet endlocal exit /b %ERRORLEVEL% :Exit_Error +call :CleanUpDotnet endlocal echo Failed with error %ERRORLEVEL% -exit /b %ERRORLEVEL% \ No newline at end of file +exit /b %ERRORLEVEL% + +:CleanUpDotnet +:: Save the error level so it can be +:: restored when exiting the function +set PrevErrorLevel=%ERRORLEVEL% + +:: Shutdown all dotnet persistent servers so that the +:: dotnet executable is not left open in the background. +:: As of dotnet 2.1.3 three servers are left running in +:: the background. This will shutdown them all down. 
+:: See here for more info: https://github.com/dotnet/cli/issues/9458 +:: This fixes an issue when re-running the build script because +:: the build script was trying to replace the existing dotnet +:: binaries which were sometimes still in use. +call "%_dotnet%" build-server shutdown +exit /b %PrevErrorLevel% \ No newline at end of file diff --git a/build.sh b/build.sh index e2292693..2b20be39 100755 --- a/build.sh +++ b/build.sh @@ -175,8 +175,6 @@ then echo "Installing dotnet SDK ... " curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli - export LOCAL_NUGET_PACKAGES_DIR=./local-nuget-packages - # Build managed code echo "Building managed code ... " _dotnet="${__currentScriptDir}/cli/dotnet" @@ -284,7 +282,7 @@ then exit 1 fi # Review: Adding "--upgrade" to pip install will cause problems when using Anaconda as the python distro because of Anaconda's quirks with pytest. - "${PythonExe}" -m pip install nose "pytest>=4.4.0" graphviz "pytest-cov>=2.6.1" "jupyter_client>=4.4.0" "nbconvert>=4.2.0" + "${PythonExe}" -m pip install nose "pytest>=4.4.0" pytest-xdist graphviz "pytest-cov>=2.6.1" "jupyter_client>=4.4.0" "nbconvert>=4.2.0" if [ ${PythonVersion} = 2.7 ] then "${PythonExe}" -m pip install --upgrade pyzmq @@ -294,7 +292,7 @@ then "${PythonExe}" -m pip install --upgrade pytest-remotedata fi - "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.12" + "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33" fi "${PythonExe}" -m pip install --upgrade "${Wheel}" "${PythonExe}" -m pip install "scikit-learn==0.19.2" @@ -311,12 +309,25 @@ then TestsPath2=${__currentScriptDir}/src/python/tests TestsPath3=${__currentScriptDir}/src/python/tests_extended ReportPath=${__currentScriptDir}/build/TestCoverageReport - "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath1}" - "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath2}" + "${PythonExe}" -m pytest -n 4 --verbose --maxfail=1000 --capture=sys "${TestsPath2}" "${TestsPath1}" || \ + "${PythonExe}" -m pytest -n 4 --last-failed --verbose --maxfail=1000 --capture=sys "${TestsPath2}" "${TestsPath1}" if [ ${__runExtendedTests} = true ] - then - "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath3}" + then + echo "Running extended tests ... " + if [ ! "$(uname -s)" = "Darwin" ] + then + # Required for Image.py and Image_df.py to run successfully on Ubuntu. + { + apt-get update + apt-get install libc6-dev -y + apt-get install libgdiplus -y + } || { + # Required for Image.py and Image_df.py to run successfully on CentOS. + yum install glibc-devel -y + } + fi + "${PythonExe}" -m pytest -n 4 --verbose --maxfail=1000 --capture=sys "${TestsPath3}" fi fi diff --git a/build/ci/phase-template.yml b/build/ci/phase-template.yml index 4df9692c..047d95fe 100644 --- a/build/ci/phase-template.yml +++ b/build/ci/phase-template.yml @@ -26,7 +26,8 @@ phases: - script: $(_buildScript) --configuration $(_configuration) --runTests $(_testOptions) # Mac phases - ${{ if eq(parameters.name, 'Mac') }}: - - script: brew update && brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f5b1ac99a7fba27c19cee0bc4f036775c889b359/Formula/libomp.rb mono-libgdiplus gettext && brew link gettext --force + # Note: Manually pinning the libomp URL below is needed to avoid an error at runtime. Installing using 'brew install libomp' results in "Intel MKL FATAL ERROR: Cannot load libmkl_intel_thread.dylib."
+ - script: brew update && brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f5b1ac99a7fba27c19cee0bc4f036775c889b359/Formula/libomp.rb gettext && brew link gettext --force && brew unlink python@2 && brew install mono-libgdiplus - ${{ if eq(parameters.testDistro, 'noTests') }}: - script: chmod 777 $(_buildScript) && $(_buildScript) --configuration $(_configuration) - ${{ if eq(parameters.testDistro, '') }}: @@ -50,7 +51,6 @@ phases: # Publish build artifacts - ${{ if or(eq(parameters.name, 'Linux_Ubuntu16'), eq(parameters.name, 'Windows'), eq(parameters.name, 'Mac')) }}: - task: PublishBuildArtifacts@1 - condition: and(always(), ne(variables['Build.Reason'], 'PullRequest')) displayName: Publish wheel file to VSTS artifacts inputs: pathToPublish: $(Build.SourcesDirectory)/target diff --git a/build/libs_linux.txt b/build/libs_linux.txt index c2c7d848..b7298fef 100644 --- a/build/libs_linux.txt +++ b/build/libs_linux.txt @@ -1,3 +1,4 @@ +Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.so libFastTreeNative.so diff --git a/build/libs_mac.txt b/build/libs_mac.txt index 1ebc1724..1c4dc2e4 100644 --- a/build/libs_mac.txt +++ b/build/libs_mac.txt @@ -1,3 +1,4 @@ +Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.dylib libFastTreeNative.dylib diff --git a/docs/developers/linux-build.md b/docs/developers/linux-build.md index 6ed681e8..fa59738e 100644 --- a/docs/developers/linux-build.md +++ b/docs/developers/linux-build.md @@ -12,9 +12,9 @@ Building NimbusML from source on Linux ## Build Run `./build.sh` -This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `./build.sh --configuration RlsLinPy3.7` for examle. +This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `./build.sh --configuration RlsLinPy3.7` for example. For additional options including running tests and building components independently, see `./build.sh -h`. ### Known Issues -The LightGBM estimator fails on Linux when building from source. The official NimbusML Linux wheel package on Pypi.org has a working version of LightGBM. \ No newline at end of file +The LightGBM estimator fails on Linux when building from source. The official NimbusML Linux wheel package on PyPI.org has a working version of LightGBM. diff --git a/docs/developers/windows-build.md b/docs/developers/windows-build.md index 8dd0e4b8..4d8c4da5 100644 --- a/docs/developers/windows-build.md +++ b/docs/developers/windows-build.md @@ -7,6 +7,6 @@ Building NimbusML from source on Windows ## Build Run `build.cmd` -This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `build.cmd --configuration RlsWinPy3.7` for examle. +This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `build.cmd --configuration RlsWinPy3.7` for example.
For additional options including running tests and building components independently, see `build.cmd -?`. diff --git a/docs/release-notes/release-1.5.0.md b/docs/release-notes/release-1.5.0.md new file mode 100644 index 00000000..e5a2eded --- /dev/null +++ b/docs/release-notes/release-1.5.0.md @@ -0,0 +1,101 @@ +# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.5.0 + +## **New Features** + +- **Initial implementation of `csr_matrix` output support.** + + [PR#250](https://github.com/microsoft/NimbusML/pull/250) + Add support for data output in `scipy.sparse.csr_matrix` format. + + ```python + xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'}) + xf.fit(train_df) + result = xf.transform(train_df, as_csr=True) + ``` + +- **Permutation Feature Importance for model interpretability.** + + [PR#279](https://github.com/microsoft/NimbusML/pull/279) + Adds `permutation_feature_importance()` method to `Pipeline` and + predictor estimators, enabling evaluation of model-wide feature + importances on any dataset with the same schema as the dataset used + to fit the `Pipeline`. + + ```python + pipe = Pipeline([ + LogisticRegressionBinaryClassifier(label='label', feature=['feature']) + ]) + pipe.fit(data) + pipe.permutation_feature_importance(data) + ``` + +- **Initial implementation of DateTime input and output column support.** + + [PR#290](https://github.com/microsoft/NimbusML/pull/290) + Add initial support for input and output of Pandas DateTime columns. + +- **Initial implementation of LpScaler.** + + [PR#253](https://github.com/microsoft/NimbusML/pull/253) + Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1, or LInf). + Performs the following operation on a vector X: Y = (X - M) / D, where M is the mean and D + is the L2, L1, or LInf norm (see the numeric sketch below). + +- **Add support for variable-length vector output.** + + [PR#267](https://github.com/microsoft/NimbusML/pull/267) + Support output of columns returned from ML.NET which contain variable-length vectors. + +- **Save `predictor_model` when pickling a `Pipeline`.** + + [PR#295](https://github.com/microsoft/NimbusML/pull/295) + +- **Initial implementation of the WordTokenizer transform.** + + [PR#296](https://github.com/microsoft/NimbusML/pull/296) + +- **Add support for summary output from tree-based predictors.** + + [PR#298](https://github.com/microsoft/NimbusML/pull/298) + +## **Bug Fixes** + +- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided.** + + [PR#294](https://github.com/microsoft/NimbusML/pull/294) + Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided. + +- **Fix issue when using `predict_proba` or `decision_function` with combined models.** + + [PR#272](https://github.com/microsoft/NimbusML/pull/272) + +- **Fix `Pipeline._extract_classes_from_headers` was not checking for valid steps.** + + [PR#292](https://github.com/microsoft/NimbusML/pull/292) + +- **Fix BinaryDataStream was not valid as input for transformer.** + + [PR#307](https://github.com/microsoft/NimbusML/pull/307) + +- **Fix casing for the installPythonPackages build.sh argument.** + + [PR#256](https://github.com/microsoft/NimbusML/pull/256) + +## **Breaking Changes** + +- **Removed `y` parameter from `Pipeline.transform()`** + + [PR#294](https://github.com/microsoft/NimbusML/pull/294) + Removed `y` parameter from `Pipeline.transform()` as it is neither needed nor used for transforming data with a fitted `Pipeline`. + +## **Enhancements** + +None.
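The LpScaler formula in the notes above can be checked with a few lines of plain `numpy`. This is a numeric sketch of the math only, not the `nimbusml` API, and it assumes one plausible reading of the formula: D is taken as the norm of the mean-centered vector.

```python
import numpy as np

# Y = (X - M) / D for one row, with M = mean(X) and D = the L2 norm
# of the centered vector (assumed reading of the formula).
X = np.array([3.0, 4.0])
M = X.mean()                      # 3.5
centered = X - M                  # [-0.5, 0.5]
D = np.linalg.norm(centered, 2)   # ~0.7071
Y = centered / D                  # [-0.7071..., 0.7071...]
print(np.linalg.norm(Y))          # 1.0, i.e. unit L2 norm
```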
+ +## **Documentation and Samples** + +None. + +## **Remarks** + +None. diff --git a/docs/release-notes/release-1.6.0.md b/docs/release-notes/release-1.6.0.md new file mode 100644 index 00000000..fa5ef3d8 --- /dev/null +++ b/docs/release-notes/release-1.6.0.md @@ -0,0 +1,42 @@ +# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.6.0 + +## **New Features** + +- **Initial implementation of NGramExtractor.** + + [PR#320](https://github.com/microsoft/NimbusML/pull/320) + Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) + in a given vector of keys. It does so by building a dictionary of n-grams and using + the id in the dictionary as the index in the bag. + +- **Update Manifest Generator.** + + [PR#329](https://github.com/microsoft/NimbusML/pull/329) + Update the Manifest Generator project to work with the latest changes and incorporate + it into the build process. + +## **Bug Fixes** + +None. + +## **Enhancements** + +- **Update To ML.NET Version 1.4.0.** + + [PR#353](https://github.com/microsoft/NimbusML/pull/353) + +- **Update To Latest Version Of DataPrep.** + + [PR#379](https://github.com/microsoft/NimbusML/pull/379) + +- **Update Tests To Execute In Parallel.** + + [PR#331](https://github.com/microsoft/NimbusML/pull/331) + +## **Documentation and Samples** + +None. + +## **Remarks** + +None. diff --git a/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg b/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg deleted file mode 100644 index 0a8b2fbd..00000000 Binary files a/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 2ceed83a..00000000 Binary files a/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 845b027f..00000000 Binary files a/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index a8debf72..00000000 Binary files a/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index f858c678..00000000 Binary files a/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 3cf6ed34..00000000 Binary files a/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 008df73c..00000000 Binary files a/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg and
/dev/null differ diff --git a/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index bdcd6852..00000000 Binary files a/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 5729bfa7..00000000 Binary files a/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index beefe429..00000000 Binary files a/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index f728196c..00000000 Binary files a/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 73ffedf4..00000000 Binary files a/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 9cbdef31..00000000 Binary files a/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 069b69d9..00000000 Binary files a/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 8e27e3cc..00000000 Binary files a/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index f72c9382..00000000 Binary files a/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 554d2417..00000000 Binary files a/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 
fc844210..00000000 Binary files a/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 820b48b3..00000000 Binary files a/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 4174ee8e..00000000 Binary files a/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index cb04dfd5..00000000 Binary files a/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 5be74193..00000000 Binary files a/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 7c5afeb9..00000000 Binary files a/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 11d473a0..00000000 Binary files a/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 381c705c..00000000 Binary files a/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index cbd0cf9d..00000000 Binary files a/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 7e448a72..00000000 Binary files a/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index c24c142e..00000000 Binary files a/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git 
a/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 42d18904..00000000 Binary files a/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 045429c8..00000000 Binary files a/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 4a1216b1..00000000 Binary files a/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 0d97af5c..00000000 Binary files a/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index e8e99abc..00000000 Binary files a/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 8f51320e..00000000 Binary files a/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 765ce5f9..00000000 Binary files a/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index dffcf5c4..00000000 Binary files a/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 0c802cb0..00000000 Binary files a/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 88add318..00000000 Binary files a/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg 
b/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 6348fe79..00000000 Binary files a/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 6637e4ff..00000000 Binary files a/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 2b4619e7..00000000 Binary files a/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg deleted file mode 100644 index 2e943616..00000000 Binary files a/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 88925eb0..00000000 Binary files a/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index 036a2ca2..00000000 Binary files a/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg deleted file mode 100644 index fcb211d3..00000000 Binary files a/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ diff --git a/nimbusml.sln b/nimbusml.sln index 546014a9..c87f94b8 100644 --- a/nimbusml.sln +++ b/nimbusml.sln @@ -20,6 +20,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution version.txt = version.txt EndProjectSection EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManifestGenerator", "src\ManifestGenerator\ManifestGenerator.csproj", "{D3AED287-722F-4243-966E-77AD0652B38E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution DbgLinPy2.7|x64 = DbgLinPy2.7|x64 @@ -65,36 +67,36 @@ Global {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.5|x64.ActiveCfg = DbgLinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.5|x64.Build.0 = DbgLinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.6|x64.ActiveCfg = DbgLinPy3.6|x64 - {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.6|x64.Build.0 = DbgLinPy3.6|x64 + {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.Build.0 = DbgLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy2.7|x64.Build.0 = 
DbgWinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64 - {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64 + {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy2.7|x64.ActiveCfg = RlsLinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy2.7|x64.Build.0 = RlsLinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.5|x64.ActiveCfg = RlsLinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.5|x64.Build.0 = RlsLinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.6|x64.ActiveCfg = RlsLinPy3.6|x64 - {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.6|x64.Build.0 = RlsLinPy3.6|x64 + {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.Build.0 = RlsLinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.6|x64.ActiveCfg = RlsMacPy3.6|x64 - {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.6|x64.Build.0 = RlsMacPy3.6|x64 + {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.Build.0 = RlsMacPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy2.7|x64.Build.0 = RlsWinPy2.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64 - {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64 + {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64 {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgLinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgLinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64 @@ -105,8 +107,8 @@ Global {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64 - {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64 + {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsLinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsLinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64 @@ -119,9 +121,45 @@ Global {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64 
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64 - {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64 + {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64 {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy2.7|x64.ActiveCfg = DbgLinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy2.7|x64.Build.0 = DbgLinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.5|x64.ActiveCfg = DbgLinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.5|x64.Build.0 = DbgLinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.6|x64.ActiveCfg = DbgLinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.6|x64.Build.0 = DbgLinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.7|x64.Build.0 = DbgLinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy2.7|x64.Build.0 = DbgWinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy2.7|x64.ActiveCfg = RlsLinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy2.7|x64.Build.0 = RlsLinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.5|x64.ActiveCfg = RlsLinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.5|x64.Build.0 = RlsLinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.6|x64.ActiveCfg = RlsLinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.6|x64.Build.0 = RlsLinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.7|x64.Build.0 = RlsLinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.6|x64.ActiveCfg = RlsMacPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.6|x64.Build.0 = RlsMacPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.7|x64.Build.0 = RlsMacPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy2.7|x64.Build.0 = RlsWinPy2.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64 + {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64 EndGlobalSection 
GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/nuget.config b/nuget.config index 75ab3744..c0efdcaa 100644 --- a/nuget.config +++ b/nuget.config @@ -5,7 +5,7 @@ - - + + diff --git a/release-next.md b/release-next.md index 031f060f..c6d1ec43 100644 --- a/release-next.md +++ b/release-next.md @@ -2,91 +2,11 @@ ## **New Features** -- **Initial implementation of `csr_matrix` output support.** - - [PR#250](https://github.com/microsoft/NimbusML/pull/250) - Add support for data output in `scipy.sparse.csr_matrix` format. - - ```python - xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'}) - xf.fit(train_df) - result = xf.transform(train_df, as_csr=True) - ``` - -- **Permutation Feature Importance for model interpretibility.** - - [PR#279](https://github.com/microsoft/NimbusML/pull/279) - Adds `permutation_feature_importance()` method to `Pipeline` and - predictor estimators, enabling evaluation of model-wide feature - importances on any dataset with same schema as the dataset used - to fit the `Pipeline`. - - ```python - pipe = Pipeline([ - LogisticRegressionBinaryClassifier(label='label', feature=['feature']) - ]) - pipe.fit(data) - pipe.permutation_feature_importance(data) - ``` - -- **Initial implementation of DateTime input and output column support.** - - [PR#290](https://github.com/microsoft/NimbusML/pull/290) - Add initial support for input and output of Pandas DateTime columns. - -- **Initial implementation of LpScaler.** - - [PR#253](https://github.com/microsoft/NimbusML/pull/253) - Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). - Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D - is either L2 norm, L1 norm or LInf norm. - -- **Add support for variable length vector output.** - - [PR#267](https://github.com/microsoft/NimbusML/pull/267) - Support output of columns returned from ML.Net which contain variable length vectors. - -- **Save `predictor_model` when pickling a `Pipeline`.** - - [PR#295](https://github.com/microsoft/NimbusML/pull/295) - -- **Initial implementation of the WordTokenizer transform.** - - [PR#296](https://github.com/microsoft/NimbusML/pull/296) - -- **Add support for summary output from tree based predictors.** - - [PR#298](https://github.com/microsoft/NimbusML/pull/298) +None. ## **Bug Fixes** -- **Fixed `Pipeline.transform()` in transform only `Pipeline` fails if y column is provided ** - - [PR#294](https://github.com/microsoft/NimbusML/pull/294) - Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided - -- **Fix issue when using `predict_proba` or `decision_function` with combined models.** - - [PR#272](https://github.com/microsoft/NimbusML/pull/272) - -- **Fix `Pipeline._extract_classes_from_headers` was not checking for valid steps.** - - [PR#292](https://github.com/microsoft/NimbusML/pull/292) - -- **Fix BinaryDataStream was not valid as input for transformer.** - - [PR#307](https://github.com/microsoft/NimbusML/pull/307) - -- **Fix casing for the installPythonPackages build.sh argument.** - - [PR#256](https://github.com/microsoft/NimbusML/pull/256) - -## **Breaking Changes** - -- **Removed `y` parameter from `Pipeline.transform()`** - - [PR#294](https://github.com/microsoft/NimbusML/pull/294) - Removed `y` parameter from `Pipeline.transform()` as it is not needed nor used for transforming data with a fitted `Pipeline`. +None. 
## **Enhancements** diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 00947124..30450540 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -8,9 +8,9 @@ using System.Text; using System.Threading; using Microsoft.ML; -using Microsoft.ML.Featurizers; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; +using Microsoft.ML.Featurizers; using Microsoft.ML.Runtime; using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.Ensemble; @@ -302,8 +302,8 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(CategoryImputerTransformer).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(DateTimeTransformer).Assembly); using (var ch = host.Start("Executing")) { diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 9985bb62..67ba3209 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -32,21 +32,21 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 9be84e67..535d9d75 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -33,7 +33,7 @@ public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, for (int i = 0; i < input.Data.Schema.Count; i++) colNames.Add(input.Data.Schema[i].Name); - // Iterate throuh input options, find matching source columns, create new input options + // Iterate through input options, find matching source columns, create new input options var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data }; var columns = new List(input.Columns.Length); foreach (var col in input.Columns) diff --git a/src/DotNetBridge/ManifestUtils.cs b/src/DotNetBridge/ManifestUtils.cs new file mode 100644 index 00000000..c01b8480 --- /dev/null +++ b/src/DotNetBridge/ManifestUtils.cs @@ -0,0 +1,112 @@ +//------------------------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+//------------------------------------------------------------------------------ + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Text.RegularExpressions; +using Microsoft.ML.Data; +using Microsoft.ML.EntryPoints; +using Microsoft.ML.Featurizers; +using Microsoft.ML.Model.OnnxConverter; +using Microsoft.ML.Runtime; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.Ensemble; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Trainers.LightGbm; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.TimeSeries; +using Newtonsoft.Json; +using Newtonsoft.Json.Linq; + + +namespace Microsoft.ML.DotNetBridge +{ + public static class ManifestUtils + { + private static readonly Type[] _types = new Type[] + { + typeof(TextLoader), + typeof(LinearModelParameters), + typeof(OneHotEncodingTransformer), + typeof(FastTreeBinaryModelParameters), + typeof(EnsembleModelParameters), + typeof(KMeansModelParameters), + typeof(PcaModelParameters), + typeof(CVSplit), + typeof(LightGbmBinaryModelParameters), + typeof(TensorFlowTransformer), + typeof(ImageLoadingTransformer), + typeof(SymbolicSgdLogisticRegressionBinaryTrainer), + typeof(OnnxContext), + typeof(SsaForecastingTransformer), + typeof(VariableColumnTransform), + typeof(DateTimeTransformer) + }; + + private static (IEnumerable<string> epListContents, JObject manifest) BuildManifests() + { + ConsoleEnvironment env = new ConsoleEnvironment(); + + foreach (Type type in _types) + { + env.ComponentCatalog.RegisterAssembly(type.Assembly); + } + + var catalog = env.ComponentCatalog; + + var regex = new Regex(@"\r\n?|\n", RegexOptions.Compiled); + var epListContents = catalog.AllEntryPoints() + .Select(x => string.Join("\t", + x.Name, + regex.Replace(x.Description, ""), + x.Method.DeclaringType, + x.Method.Name, + x.InputType, + x.OutputType) + .Replace(Environment.NewLine, "", StringComparison.Ordinal)) + .OrderBy(x => x); + + var manifest = JsonManifestUtils.BuildAllManifests(env, catalog); + + // Clean up the description by removing newline characters + if (manifest[FieldNames.TopEntryPoints] != null && manifest[FieldNames.TopEntryPoints] is JArray) + { + foreach (JToken entry in manifest[FieldNames.TopEntryPoints].Children()) + if (entry[FieldNames.Desc] != null) + entry[FieldNames.Desc] = regex.Replace(entry[FieldNames.Desc].ToString(), ""); + } + + return (epListContents, manifest); + } + + public static void ShowAssemblyInfo() + { + foreach (Type type in _types) + { + Assembly assembly = type.Assembly; + Console.WriteLine(assembly.Location); + } + } + + public static void GenerateManifest(string filePath) + { + var (epListContents, jObj) = BuildManifests(); + + if (!string.IsNullOrWhiteSpace(filePath)) + File.Delete(filePath); + + using (var file = File.OpenWrite(filePath)) + using (var writer = new StreamWriter(file)) + using (var jw = new JsonTextWriter(writer)) + { + jw.Formatting = Formatting.Indented; + jObj.WriteTo(jw); + } + } + } +} diff --git a/src/DotNetBridge/MessageValidator.cs b/src/DotNetBridge/MessageValidator.cs index 4243a45d..1ebcae67 100644 --- a/src/DotNetBridge/MessageValidator.cs +++ b/src/DotNetBridge/MessageValidator.cs @@ -195,7 +195,7 @@ public sealed class MessageValidator { return "Failed to initialize CUDA runtime. Possible reasons:" + "\n" + @"1. The machine does not have CUDA-capable card. Supported devices have compute capability 2.0 and higher." + "\n" + - @"2. Outdated graphics drivers.
Please install the latest drivers from http://www.nvidia.com/Drivers ." + "\n" + + @"2. Outdated graphics drivers. Please install the latest drivers from https://www.nvidia.com/Download/index.aspx?lang=en-us ." + "\n" + @"3. CUDA runtime DLLs are missing, please see the GPU acceleration help for the installation instructions."; } ) diff --git a/src/ManifestGenerator/ManifestGenerator.cs b/src/ManifestGenerator/ManifestGenerator.cs index 985318f6..b872775d 100644 --- a/src/ManifestGenerator/ManifestGenerator.cs +++ b/src/ManifestGenerator/ManifestGenerator.cs @@ -3,56 +3,79 @@ // Licensed under the MIT License. //------------------------------------------------------------------------------ +using System; using System.IO; -using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.EntryPoints.JsonUtils; -using Microsoft.ML.Runtime.ImageAnalytics; -using Microsoft.ML.Runtime.Learners; -using Microsoft.ML.Runtime.LightGBM; -using Microsoft.ML.Runtime.Model.Onnx; -using Microsoft.ML.Runtime.PipelineInference; -using Microsoft.ML.Trainers.FastTree; -using Microsoft.ML.Trainers.KMeans; -using Microsoft.ML.Trainers.PCA; -using Microsoft.ML.Trainers.SymSgd; -using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Categorical; -using Newtonsoft.Json; - -namespace Microsoft.MachineLearning.ManifestGenerator +using System.Linq; +using Microsoft.ML.DotNetBridge; + + +namespace Microsoft.ML.ManifestGenerator { public static class ManifestGenerator { - public static void Main() + private const int ERROR_SUCCESS = 0; + private const int ERROR_BAD_ARGUMENTS = 1; + private const int ERROR_MANIFEST_INVALID = 2; + + public static void ShowUsage() { - using (var env = new ConsoleEnvironment()) + string usage = + "Usage:\n" + + " create MANIFEST_PATH Creates a new manifest given the\n" + + " current assemblies and stores it\n" + + " in the file MANIFEST_PATH.\n" + + " verify MANIFEST_PATH Checks if the manifest specified by\n" + + " MANIFEST_PATH is valid given\n" + + " the current assemblies.\n" + + "\n"; + + Console.WriteLine(usage); + } + + public static int Main(string[] args) + { + int exitCode = ERROR_BAD_ARGUMENTS; + + if (args.Length == 2) { - env.ComponentCatalog.RegisterAssembly(typeof(TextLoader).Assembly); // ML.Data - env.ComponentCatalog.RegisterAssembly(typeof(LinearPredictor).Assembly); // ML.StandardLearners - env.ComponentCatalog.RegisterAssembly(typeof(CategoricalTransform).Assembly); // ML.Transforms - env.ComponentCatalog.RegisterAssembly(typeof(FastTreeBinaryPredictor).Assembly); // ML.FastTree - env.ComponentCatalog.RegisterAssembly(typeof(KMeansPredictor).Assembly); // ML.KMeansClustering - env.ComponentCatalog.RegisterAssembly(typeof(PcaPredictor).Assembly); // ML.PCA - env.ComponentCatalog.RegisterAssembly(typeof(Experiment).Assembly); // ML.Legacy - env.ComponentCatalog.RegisterAssembly(typeof(LightGbmBinaryPredictor).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransform).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(ImageLoaderTransform).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(SymSgdClassificationTrainer).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly); - var catalog = env.ComponentCatalog; - var jObj = JsonManifestUtils.BuildAllManifests(env, catalog); - - var jPath = "manifest.json"; - using (var file = File.OpenWrite(jPath)) - using (var writer = new StreamWriter(file)) - using
(var jw = new JsonTextWriter(writer)) + if (args[0].ToLower() == "create") { - jw.Formatting = Formatting.Indented; - jObj.WriteTo(jw); + ManifestUtils.ShowAssemblyInfo(); + ManifestUtils.GenerateManifest(args[1]); + + exitCode = ERROR_SUCCESS; } + else if (args[0].ToLower() == "verify") + { + string tmpFilePath = Path.GetTempFileName(); + ManifestUtils.GenerateManifest(tmpFilePath); + + exitCode = FilesMatch(args[1], tmpFilePath) ? + ERROR_SUCCESS : + ERROR_MANIFEST_INVALID; + + File.Delete(tmpFilePath); + } + } + + if (exitCode == ERROR_BAD_ARGUMENTS) + { + Console.WriteLine("ManifestGenerator: Error - Invalid Arguments."); + ShowUsage(); } + + return exitCode; + } + + private static bool FilesMatch(string path1, string path2) + { + long fileLength1 = new FileInfo(path1).Length; + long fileLength2 = new FileInfo(path2).Length; + if (fileLength1 != fileLength2) return false; + + // TODO: read in only parts of the file at a time + bool bytesMatch = File.ReadAllBytes(path1).SequenceEqual(File.ReadAllBytes(path2)); + return bytesMatch; } } } diff --git a/src/ManifestGenerator/ManifestGenerator.csproj b/src/ManifestGenerator/ManifestGenerator.csproj index 4cd94610..13e69006 100644 --- a/src/ManifestGenerator/ManifestGenerator.csproj +++ b/src/ManifestGenerator/ManifestGenerator.csproj @@ -1,18 +1,24 @@  - {D3AED287-722F-4243-966E-77AD0652B38E} - Exe - Properties + netcoreapp2.1 true x64 + CORECLR ManifestGenerator ManifestGenerator false - $(OutputBase) - Debug;Release - Microsoft.MachineLearning.ManifestGenerator.ManifestGenerator + ..\$(Platform)\$(Configuration)\ + DbgWinPy3.7;DbgWinPy3.6;DbgWinPy3.5;DbgWinPy2.7;RlsWinPy3.7;RlsWinPy3.6;RlsWinPy3.5;RlsWinPy2.7;DbgLinPy3.7;DbgLinPy3.6;DbgLinPy3.5;DbgLinPy2.7;RlsLinPy3.7;RlsLinPy3.6;RlsLinPy3.5;RlsLinPy2.7;RlsMacPy3.7;RlsMacPy3.6 + Microsoft.ML.ManifestGenerator.ManifestGenerator + 0.1.0 + Microsoft Corporation + (c) Microsoft Corporation. All rights reserved. 
+ https://github.com/Microsoft/NimbusML + https://github.com/Microsoft/NimbusML + latest - netcoreapp2.0 + {D3AED287-722F-4243-966E-77AD0652B38E} + Exe @@ -24,13 +30,7 @@ - - - - - - - + diff --git a/src/ManifestGenerator/ManifestGenerator.sln b/src/ManifestGenerator/ManifestGenerator.sln deleted file mode 100644 index 56d26d1d..00000000 --- a/src/ManifestGenerator/ManifestGenerator.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.27428.2037 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManifestGenerator", "ManifestGenerator.csproj", "{D3AED287-722F-4243-966E-77AD0652B38E}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {D3AED287-722F-4243-966E-77AD0652B38E}.Debug|x64.ActiveCfg = Debug|x64 - {D3AED287-722F-4243-966E-77AD0652B38E}.Debug|x64.Build.0 = Debug|x64 - {D3AED287-722F-4243-966E-77AD0652B38E}.Release|x64.ActiveCfg = Release|x64 - {D3AED287-722F-4243-966E-77AD0652B38E}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {620035F0-EA24-426B-BA6F-FF34BC8E14FA} - EndGlobalSection -EndGlobal diff --git a/src/ManifestGenerator/app.config b/src/ManifestGenerator/app.config index 5618aa07..7ded20c2 100644 --- a/src/ManifestGenerator/app.config +++ b/src/ManifestGenerator/app.config @@ -1,13 +1,5 @@ - - - - - - - - diff --git a/src/NativeBridge/DataViewInterop.h b/src/NativeBridge/DataViewInterop.h index c764b285..f9e87763 100644 --- a/src/NativeBridge/DataViewInterop.h +++ b/src/NativeBridge/DataViewInterop.h @@ -16,7 +16,7 @@ typedef MANAGED_CALLBACK_PTR(bool, GETLABELS)(DataSourceBlock *source, int col, // REVIEW: boost_python is not updated at the same speed as swig or pybind11. // Both have a larger audience now, see about pybind11 https://github.com/davisking/dlib/issues/293 -// It handles csr_matrix: http://pybind11-rtdtest.readthedocs.io/en/stable/advanced.html#transparent-conversion-of-dense-and-sparse-eigen-data-types. +// It handles csr_matrix: https://pybind11-rtdtest.readthedocs.io/en/stable/advanced.html#transparent-conversion-of-dense-and-sparse-eigen-data-types. using namespace boost::python; // The data source wrapper used for managed interop. Some of the fields of this are visible to managed code. 
@@ -240,6 +240,7 @@ class DataSourceBlock if (bp::extract(str(s).encode("utf_8")).check()) { + size = -1; missing = -1; pch = bp::extract(str(s).encode("utf_8")); #if _MSC_VER diff --git a/src/NativeBridge/NativeBridge.vcxproj b/src/NativeBridge/NativeBridge.vcxproj index f9cf674c..82367ce5 100644 --- a/src/NativeBridge/NativeBridge.vcxproj +++ b/src/NativeBridge/NativeBridge.vcxproj @@ -150,7 +150,7 @@ CORECLR;_DEBUG;_WINDOWS;_USRDLL;PYBRIDGE_EXPORTS;BOOST_USE_STATIC_LIBS;BOOST_PYTHON_STATIC_LIB;BOOST_ALL_NO_LIB;BOOST_NUMPY_STATIC_LIB;_HAS_ITERATOR_DEBUGGING;%(PreprocessorDefinitions) $(BoostRoot)\Include;$(PythonRoot)\include true - MultiThreadedDebug + MultiThreadedDebugDLL true diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index 626822c9..6a0b7ab7 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -11,21 +11,21 @@ - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + diff --git a/src/python/README.md b/src/python/README.md index 74254980..4385ade0 100644 --- a/src/python/README.md +++ b/src/python/README.md @@ -1,27 +1,35 @@ -NimbusML +# NimbusML -`nimbusml` provides battle-tested state-of-the-art ML algorithms, -transforms and components, aiming to make them useful for all -developers, data scientists, and information workers and helpful in all -products, services and devices. The components are authored by the team -members, as well as numerous contributors from MSR, CISL, Bing and other -teams at Microsoft. +`nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). + +`nimbusml` aims to enable data science teams that are more familiar with Python +to take advantage of ML.NET's functionality and performance. It provides +battle-tested, state-of-the-art ML algorithms, transforms, and components. The +components are authored by the team members, as well as numerous contributors +from MSR, CISL, Bing, and other teams at Microsoft. `nimbusml` is interoperable with `scikit-learn` estimators and transforms, -while adding a suite of highly optimized algorithms written in C++ and -C\# for speed and performance. `nimbusml` trainers and transforms support -the following data structures for the `fit()` and `transform()` methods: +while adding a suite of fast, highly optimized, and scalable algorithms written +in C++ and C\#. `nimbusml` trainers and transforms support the following data +structures for the `fit()` and `transform()` methods: - `numpy.ndarray` - `scipy.sparse_cst` - `pandas.DataFrame`. -In addition, `nimbusml` also supports streaming from files without loading -the dataset into memory, which allows training on data significantly -exceeding memory using `FileDataStream`. +In addition, `nimbusml` also supports streaming from files without loading the +dataset into memory with `FileDataStream`, which allows training on data +significantly exceeding memory. -With `FileDataStream` `nimbusml` is able to handle up to **billion** features - and **billions** of training examples for select algorithms. +With `FileDataStream`, `nimbusml` is able to handle up to a **billion** +features and **billions** of training examples for select algorithms. For more details, please refer to the documentation: -. \ No newline at end of file +. + +## Third party notices + +`nimbusml` contains ML.NET binaries and the .NET Core CLR runtime, as well as +their dependencies. Both ML.NET and .NET Core CLR are made available under the +MIT license. 
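The README text above describes training directly from files with `FileDataStream`. A minimal sketch of that streaming workflow (the `infert` sample dataset and its column names are assumptions borrowed from the examples elsewhere in this patch):

```python
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import FastLinearBinaryClassifier

# Stream rows from disk on demand instead of loading the dataset into memory.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

# Train directly on the stream; rows are materialized in batches, so the
# dataset can be far larger than available memory.
pipeline = Pipeline([
    FastLinearBinaryClassifier(feature=['age', 'parity', 'spontaneous'],
                               label='case')
])
pipeline.fit(data)
```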
Please refer to the [third party notices](https://github.com/microsoft/NimbusML/blob/master/THIRD-PARTY-NOTICES.txt) +for full licensing information for ML.NET and .NET Core CLR. \ No newline at end of file diff --git a/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt b/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt index 45b12209..31314605 100644 --- a/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt +++ b/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt @@ -45,10 +45,10 @@ `_ `Large Margin Classification Using the Perceptron Algorithm - `_ + `_ `Discriminative Training Methods for Hidden Markov Models - `_ + `_ :param loss: The default is :py:class:`'hinge' `. Other diff --git a/src/python/docs/docstrings/Dart.txt b/src/python/docs/docstrings/Dart.txt index faa504e0..d037b6f7 100644 --- a/src/python/docs/docstrings/Dart.txt +++ b/src/python/docs/docstrings/Dart.txt @@ -7,9 +7,9 @@ `_ is an ensemble method of boosted regression trees. The Dropouts meet Multiple Additive Regression - Trees (DART) employs dropouts in MART and overcomes the issues of over- + Trees (DART) employs dropouts in MART and overcomes the issues of over- specialization of MART, - achiving better performance in many tasks. + achieving better performance in many tasks. **Reference** diff --git a/src/python/docs/docstrings/DssmFeaturizer.txt b/src/python/docs/docstrings/DssmFeaturizer.txt deleted file mode 100644 index d71d2540..00000000 --- a/src/python/docs/docstrings/DssmFeaturizer.txt +++ /dev/null @@ -1,32 +0,0 @@ - """ - - The input to this transform is text. It applies a pretrained DSSM - featurizer and outputs semantic embeddings for - the input vectors and a cosine similarity computed between the query - and document columns. - - .. remarks:: - DSSM is a neural network algorithm that produces feature embeddings - for key-value string pairs. It is trained - using a dataset consisting of positive key-value pairs, from which - the original rows are used as correct - examples, and the strings are recombined to produce adversarial, - incorrect training examples. Some example of - key-value pairs include search query and clicked document title text, - search query and clicked ad content text, - Search using Clickthrough Data `_ , an MSR publication. - - - .. seealso:: - :py:class:`NGramFeaturizer `, - :py:class:`Sentiment `, - :py:class:`SsweEmbedding `, - :py:class:`WordEmbedding `. - - .. index:: transform, featurizer, text - - Example: - .. 
literalinclude:: /../nimbusml/examples/DssmFeaturizer.py - :language: python - """ \ No newline at end of file diff --git a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt index 787972a2..c8e86ac9 100644 --- a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt +++ b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt @@ -22,7 +22,7 @@ `Field Aware Factorization Machines `_, `Field-aware Factorization Machines for CTR Prediction - `_, + `_, `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_ diff --git a/src/python/docs/docstrings/FastForestBinaryClassifier.txt b/src/python/docs/docstrings/FastForestBinaryClassifier.txt index 6ebc1938..3e9a6688 100644 --- a/src/python/docs/docstrings/FastForestBinaryClassifier.txt +++ b/src/python/docs/docstrings/FastForestBinaryClassifier.txt @@ -33,7 +33,7 @@ **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/docs/docstrings/FastForestRegressor.txt b/src/python/docs/docstrings/FastForestRegressor.txt index 0d01ad8c..35a6ad5e 100644 --- a/src/python/docs/docstrings/FastForestRegressor.txt +++ b/src/python/docs/docstrings/FastForestRegressor.txt @@ -43,7 +43,7 @@ **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt index db2c74db..a16893e8 100644 --- a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt +++ b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt @@ -1,7 +1,7 @@ """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear binary classification. .. remarks:: ``FastLinearBinaryClassifier`` is a trainer based on the Stochastic @@ -58,8 +58,7 @@ content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param loss: The default is :py:class:`'log' `. Other diff --git a/src/python/docs/docstrings/FastLinearClassifier.txt b/src/python/docs/docstrings/FastLinearClassifier.txt index 2fcb2868..d9984dd5 100644 --- a/src/python/docs/docstrings/FastLinearClassifier.txt +++ b/src/python/docs/docstrings/FastLinearClassifier.txt @@ -1,6 +1,7 @@ """ - Train an SDCA multi class model + A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for + multi class classification. .. remarks:: ``FastLinearClassifier`` is a trainer based on the Stochastic Dual @@ -56,8 +57,7 @@ content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param feature: see `Columns `_. diff --git a/src/python/docs/docstrings/FastLinearRegressor.txt b/src/python/docs/docstrings/FastLinearRegressor.txt index 4dda71be..9e7c5d88 100644 --- a/src/python/docs/docstrings/FastLinearRegressor.txt +++ b/src/python/docs/docstrings/FastLinearRegressor.txt @@ -1,7 +1,7 @@ """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear regression. .. 
remarks:: ``FastLinearRegressor`` is a trainer based on the Stochastic Dual @@ -56,8 +56,7 @@ content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param loss: The only supported loss is :py:class:`'squared' diff --git a/src/python/docs/docstrings/FastTreesBinaryClassifier.txt b/src/python/docs/docstrings/FastTreesBinaryClassifier.txt index 1789d738..15865149 100644 --- a/src/python/docs/docstrings/FastTreesBinaryClassifier.txt +++ b/src/python/docs/docstrings/FastTreesBinaryClassifier.txt @@ -57,7 +57,7 @@ `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param optimizer: Default is ``sgd``. diff --git a/src/python/docs/docstrings/FastTreesRegressor.txt b/src/python/docs/docstrings/FastTreesRegressor.txt index cd1f76b8..91a3622d 100644 --- a/src/python/docs/docstrings/FastTreesRegressor.txt +++ b/src/python/docs/docstrings/FastTreesRegressor.txt @@ -62,7 +62,7 @@ `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param optimizer: Default is ``sgd``. diff --git a/src/python/docs/docstrings/FastTreesTweedieRegressor.txt b/src/python/docs/docstrings/FastTreesTweedieRegressor.txt index 76cd6749..3c02e645 100644 --- a/src/python/docs/docstrings/FastTreesTweedieRegressor.txt +++ b/src/python/docs/docstrings/FastTreesTweedieRegressor.txt @@ -14,7 +14,7 @@ `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param optimizer: Default is ``sgd``. diff --git a/src/python/docs/docstrings/FromKey.txt b/src/python/docs/docstrings/FromKey.txt index a61b7064..fd162550 100644 --- a/src/python/docs/docstrings/FromKey.txt +++ b/src/python/docs/docstrings/FromKey.txt @@ -1,7 +1,6 @@ """ - Text transforms that can be performed on data before training - a model. + Converts the key types back to their original values. .. remarks:: The ``FromKey`` transform converts a column of keys, generated using diff --git a/src/python/docs/docstrings/GamBinaryClassifier.txt b/src/python/docs/docstrings/GamBinaryClassifier.txt index 69484156..acd5f023 100644 --- a/src/python/docs/docstrings/GamBinaryClassifier.txt +++ b/src/python/docs/docstrings/GamBinaryClassifier.txt @@ -21,7 +21,7 @@ functions learned will step between the discretization boundaries. This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -57,7 +57,7 @@ `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/docs/docstrings/GamRegressor.txt b/src/python/docs/docstrings/GamRegressor.txt index 54d71d10..3e44a736 100644 --- a/src/python/docs/docstrings/GamRegressor.txt +++ b/src/python/docs/docstrings/GamRegressor.txt @@ -21,7 +21,7 @@ functions learned will step between the discretization boundaries. 
This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -57,7 +57,7 @@ `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/docs/docstrings/Goss.txt b/src/python/docs/docstrings/Goss.txt index 7ae86ec2..97265859 100644 --- a/src/python/docs/docstrings/Goss.txt +++ b/src/python/docs/docstrings/Goss.txt @@ -5,9 +5,9 @@ .. remarks:: Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling named gradient-based - sampling. For datasets with large sample size, GOSS has considerable + sampling. For datasets with large sample size, GOSS has considerable advantage in terms of - statistical and computational efficiency. + statistical and computational efficiency. diff --git a/src/python/docs/docstrings/Handler.txt b/src/python/docs/docstrings/Handler.txt index 01d767e8..4a639e1e 100644 --- a/src/python/docs/docstrings/Handler.txt +++ b/src/python/docs/docstrings/Handler.txt @@ -33,14 +33,13 @@ For more details see `Columns `_. :param replace_with: The method to use to replace NaN values. The - following choices are available. - - * Def: Replace with default value of that type, usually ``0``. If no - replace - method is specified, this is the default strategy. - * Mean: Replace NaN values with the mean of the values in that column. - * Min: Replace with minimum value in the column. - * Max: Replace with maximum value in the column. + following choices are available. + + * Def: Replace with default value of that type, usually ``0``. If no + replace method is specified, this is the default strategy. + * Mean: Replace NaN values with the mean of the values in that column. + * Min: Replace with minimum value in the column. + * Max: Replace with maximum value in the column. .. seealso:: :py:class:`Filter `, diff --git a/src/python/docs/docstrings/LightLda.txt b/src/python/docs/docstrings/LightLda.txt index 95736da9..aaec0162 100644 --- a/src/python/docs/docstrings/LightLda.txt +++ b/src/python/docs/docstrings/LightLda.txt @@ -10,7 +10,7 @@ topical vectors. LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of optimization techniques - `(http://arxiv.org/abs/1412.1576) `_. + `(https://arxiv.org/abs/1412.1576) `_. With the LDA transform, we can train a topic model to produce 1 million topics with 1 million vocabulary on a 1-billion-token document set one diff --git a/src/python/docs/docstrings/Loader.txt b/src/python/docs/docstrings/Loader.txt index ca290c1e..e94fb9e1 100644 --- a/src/python/docs/docstrings/Loader.txt +++ b/src/python/docs/docstrings/Loader.txt @@ -1,6 +1,6 @@ """ - Loaders image data. + Loads image data. .. remarks:: ``Loader`` loads images from paths. diff --git a/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt b/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt index cf028dcd..4863237a 100644 --- a/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt +++ b/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt @@ -39,14 +39,14 @@ More details about LD-SVM can be found in this paper `Local deep kernel learning for efficient non-linear SVM prediction - `_. 
**Reference** `Local deep kernel learning for efficient non-linear SVM prediction - `_ diff --git a/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt b/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt index 6fb1063d..b268dea2 100644 --- a/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt +++ b/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt @@ -69,14 +69,14 @@ **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/docs/docstrings/LogisticRegressionClassifier.txt b/src/python/docs/docstrings/LogisticRegressionClassifier.txt index db6f386e..405c20f3 100644 --- a/src/python/docs/docstrings/LogisticRegressionClassifier.txt +++ b/src/python/docs/docstrings/LogisticRegressionClassifier.txt @@ -70,14 +70,14 @@ **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/docs/docstrings/NGram.txt b/src/python/docs/docstrings/NGram.txt index e05c292a..e4d681db 100644 --- a/src/python/docs/docstrings/NGram.txt +++ b/src/python/docs/docstrings/NGram.txt @@ -1,6 +1,6 @@ """ - Extracts NGrams from text and convert them to vector using + Extracts NGrams from text and converts them to vector using dictionary. .. remarks:: diff --git a/src/python/docs/docstrings/NgramHash.txt b/src/python/docs/docstrings/NgramHash.txt index b7e34e8a..a1969901 100644 --- a/src/python/docs/docstrings/NgramHash.txt +++ b/src/python/docs/docstrings/NgramHash.txt @@ -1,6 +1,6 @@ """ - Extracts NGrams from text and convert them to vector using hashing + Extracts NGrams from text and converts them to vector using hashing trick. .. remarks:: diff --git a/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt b/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt index 958bd389..44e9ef30 100644 --- a/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt +++ b/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt @@ -29,10 +29,10 @@ us/library/azure/dn913103.aspx>`_ `Estimating the Support of a High-Dimensional Distribution - `_ + `_ `New Support Vector Algorithms - `_ + `_ `LIBSVM: A Library for Support Vector Machines `_ diff --git a/src/python/docs/docstrings/PcaAnomalyDetector.txt b/src/python/docs/docstrings/PcaAnomalyDetector.txt index 5896c5c9..f51aaf24 100644 --- a/src/python/docs/docstrings/PcaAnomalyDetector.txt +++ b/src/python/docs/docstrings/PcaAnomalyDetector.txt @@ -36,13 +36,12 @@ `Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices - `_ + `_ `A randomized algorithm for principal component analysis `_, `Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/docs/docstrings/PrefixColumnConcatenator.txt b/src/python/docs/docstrings/PrefixColumnConcatenator.txt new file mode 100644 index 00000000..aac3d116 --- /dev/null +++ b/src/python/docs/docstrings/PrefixColumnConcatenator.txt @@ -0,0 +1,44 @@ + """ + + Combines several columns into a single vector-valued column by prefix. + + .. 
remarks:: + ``PrefixColumnConcatenator`` creates a single vector-valued column from + multiple + columns. It can be performed on data before training a model. The + concatenation + can significantly speed up the processing of data when the number of + columns + is as large as hundreds to thousands. + + :param columns: a dictionary of key-value pairs, where key is the output + column name and value is a list of input column names. + + * Only one key-value pair is allowed. + * Input column type: numeric or string. + * Output column type: + `Vector Type `_. + + The << operator can be used to set this value (see + `Column Operator `_) + + For example + * PrefixColumnConcatenator(columns={'features': ['age', 'parity', + 'induced']}) + * PrefixColumnConcatenator() << {'features': ['age', 'parity', + 'induced']} + + For more details see `Columns `_. + + .. seealso:: + :py:class:`ColumnDropper + `, + :py:class:`ColumnSelector + `. + + .. index:: transform, schema + + Example: + .. literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py + :language: python + """ diff --git a/src/python/docs/docstrings/Resizer.txt b/src/python/docs/docstrings/Resizer.txt index eb45128e..2bf9857f 100644 --- a/src/python/docs/docstrings/Resizer.txt +++ b/src/python/docs/docstrings/Resizer.txt @@ -1,15 +1,15 @@ """ - Resizers an image to a specified dimension using a specified + Resizes an image to a specified dimension using a specified resizing method. .. remarks:: - ``Resizer`` resizers an image to the specified height and width + ``Resizer`` resizes an image to the specified height and width using a specified resizing method. The input variables to this transforms must be images, typically the result of the ``Loader`` transform. - :param columns: a dictionary of key-value pairs, where key is the output + :param columns: A dictionary of key-value pairs, where key is the output column name and value is the input column name. * Multiple key-value pairs are allowed. diff --git a/src/python/docs/docstrings/SgdBinaryClassifier.txt b/src/python/docs/docstrings/SgdBinaryClassifier.txt index c1ed86ac..a585e088 100644 --- a/src/python/docs/docstrings/SgdBinaryClassifier.txt +++ b/src/python/docs/docstrings/SgdBinaryClassifier.txt @@ -13,14 +13,14 @@ associated optimization problem is sparse, then Hogwild SGD achieves a nearly optimal rate of convergence. For a detailed reference, please - refer to `http://arxiv.org/pdf/1106.5730v2.pdf - `_. + refer to `https://arxiv.org/pdf/1106.5730v2.pdf + `_. **Reference** - `http://arxiv.org/pdf/1106.5730v2.pdf - `_ + `https://arxiv.org/pdf/1106.5730v2.pdf + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/docs/docstrings/SigmoidKernel.txt b/src/python/docs/docstrings/SigmoidKernel.txt index 3a22d2cd..62c5785a 100644 --- a/src/python/docs/docstrings/SigmoidKernel.txt +++ b/src/python/docs/docstrings/SigmoidKernel.txt @@ -3,8 +3,7 @@ Apply sigmoid function. tanh(gamma*+c). .. remarks:: - `SigmoidKernel `_ is a + `SigmoidKernel `_ is a kernel function that computes the similarity between two features. diff --git a/src/python/docs/docstrings/SsaForecaster.txt b/src/python/docs/docstrings/SsaForecaster.txt index 8873702b..a8a99a6f 100644 --- a/src/python/docs/docstrings/SsaForecaster.txt +++ b/src/python/docs/docstrings/SsaForecaster.txt @@ -11,7 +11,7 @@ input time-series where each component in the spectrum corresponds to a trend, seasonal or noise component in the time-series. 
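As a usage illustration for the new ``PrefixColumnConcatenator`` docstring above, a minimal sketch; the prefix-valued ``columns`` mapping and the ``nimbusml.preprocessing.schema`` import path are my assumptions, not taken from the patch:

```python
import pandas as pd
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

df = pd.DataFrame({'f_age': [26.0, 42.0],
                   'f_parity': [6.0, 1.0],
                   'case': [1.0, 0.0]})

# Gather every column whose name starts with 'f_' into one
# vector-valued 'features' column (assumed prefix semantics).
concat = PrefixColumnConcatenator(columns={'features': 'f_'})
print(concat.fit_transform(df).head())
```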
For details of the Singular Spectrum Analysis (SSA), refer to `this document - `_. + `_. .. seealso:: :py:func:`IIDChangePointDetector diff --git a/src/python/docs/docstrings/SsweEmbedding.txt b/src/python/docs/docstrings/SsweEmbedding.txt index 4c476285..4e6c56f9 100644 --- a/src/python/docs/docstrings/SsweEmbedding.txt +++ b/src/python/docs/docstrings/SsweEmbedding.txt @@ -7,12 +7,12 @@ versions of `GloVe Models `_, `FastText `_, and `Sswe - `_. + `_. .. remarks:: Sentiment-specific word embedding (SSWE) is a DNN featurizer developed - by MSRA (`paper `_). + by MSRA (`paper `_). It incorporates sentiment information into the neural network to learn sentiment specific word embedding. It proves to be useful in various @@ -63,7 +63,6 @@ .. seealso:: :py:class:`NGramFeaturizer `, - :py:class:`DssmFeaturizer `, :py:class:`Sentiment `, :py:class:`WordEmbedding `. diff --git a/src/python/docs/docstrings/SupervisedBinner.txt b/src/python/docs/docstrings/SupervisedBinner.txt index 963a560e..95317b75 100644 --- a/src/python/docs/docstrings/SupervisedBinner.txt +++ b/src/python/docs/docstrings/SupervisedBinner.txt @@ -24,7 +24,7 @@ the default is to normalize features before training. ``SupervisedBinner`` implements the `Entropy-Based Discretization - `_. + `_. Partition of the data is performed recursively to select the split with highest entropy gain with respect to the label. Therefore, the final binned features will have high correlation with diff --git a/src/python/docs/docstrings/ToKey.txt b/src/python/docs/docstrings/ToKey.txt index 2740561b..89a32047 100644 --- a/src/python/docs/docstrings/ToKey.txt +++ b/src/python/docs/docstrings/ToKey.txt @@ -1,7 +1,6 @@ """ - Text transforms that can be performed on data before training - a model. + Converts input values (words, numbers, etc.) to index in a dictionary. .. remarks:: The ``ToKey`` transform converts a column of text to key values diff --git a/src/python/docs/docstrings/WordEmbedding.txt b/src/python/docs/docstrings/WordEmbedding.txt index 41d6f1c6..f19c73d8 100644 --- a/src/python/docs/docstrings/WordEmbedding.txt +++ b/src/python/docs/docstrings/WordEmbedding.txt @@ -10,7 +10,7 @@ available options are various versions of `GloVe Models `_, `FastText `_, and `Sswe - `_. + `_. :param model_kind: Pre-trained model used to create the vocabulary. diff --git a/src/python/docs/sphinx/ci_script/_static/mystyle.css b/src/python/docs/sphinx/ci_script/_static/mystyle.css index a69e381c..a5df3a24 100644 --- a/src/python/docs/sphinx/ci_script/_static/mystyle.css +++ b/src/python/docs/sphinx/ci_script/_static/mystyle.css @@ -8432,7 +8432,7 @@ label { padding: 0px; } /* Flexible box model classes */ -/* Taken from Alex Russell http://infrequently.org/2009/08/css-3-progress/ */ +/* Taken from Alex Russell https://infrequently.org/2009/08/css-3-progress/ */ /* This file is a compatability layer. It allows the usage of flexible box model layouts accross multiple browsers, including older browsers. 
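To tie together the clarified ``ToKey``/``FromKey`` docstrings above, a small round-trip sketch (column names here are invented for illustration):

```python
import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing import FromKey, ToKey

df = pd.DataFrame({'color': ['red', 'green', 'red', 'blue']})

# ToKey maps each distinct value to an index in a learned dictionary;
# FromKey converts the key column back to the original values.
pipe = Pipeline([
    ToKey(columns={'key': 'color'}),
    FromKey(columns={'recovered': 'key'})
])
print(pipe.fit_transform(df))
```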
The newest, universal implementation of the flexible box model is used when available (see diff --git a/src/python/docs/sphinx/ci_script/conf.py b/src/python/docs/sphinx/ci_script/conf.py index f96889d1..1acb3312 100644 --- a/src/python/docs/sphinx/ci_script/conf.py +++ b/src/python/docs/sphinx/ci_script/conf.py @@ -128,8 +128,8 @@ 'relative': True, 'reference_url': { 'nimbusml': None, - 'matplotlib': 'http://matplotlib.org', - 'numpy': 'http://www.numpy.org/', + 'matplotlib': 'https://matplotlib.org', + 'numpy': 'https://www.numpy.org/', 'scipy': 'https://www.scipy.org/'}, } diff --git a/src/python/docs/sphinx/concepts/datasources.rst b/src/python/docs/sphinx/concepts/datasources.rst index 0a8b1986..731fad98 100644 --- a/src/python/docs/sphinx/concepts/datasources.rst +++ b/src/python/docs/sphinx/concepts/datasources.rst @@ -120,15 +120,21 @@ Example: Output Data Types of Transforms ------------------------------- -The return type of all of the transforms is a ``pandas.DataFrame``, when they -are used inside a `sklearn.pipeline.Pipeline -`_ -or when they are used individually. - -However, when used inside a :py:class:`nimbusml.Pipeline`, the outputs are often stored in +When used inside a `sklearn.pipeline.Pipeline +`_, +the return type of all of the transforms is a ``pandas.DataFrame``. + +When used individually or inside a :py:class:`nimbusml.Pipeline` +that contains only transforms, the default output is a ``pandas.DataFrame``. To instead output an +`IDataView `_, +pass ``as_binary_data_stream=True`` to either ``transform()`` or ``fit_transform()``. +To output a sparse CSR matrix, pass ``as_csr=True``. +See :py:class:`nimbusml.Pipeline` for more information. + +Note, when used inside a :py:class:`nimbusml.Pipeline`, the outputs are often stored in a more optimized :ref:`VectorDataViewType`, which minimizes data conversion to dataframes. When several transforms are combined inside an :py:class:`nimbusml.Pipeline`, the intermediate transforms will store the data in the optimized format and only -the last transform will return a ``pandas.DataFrame``. +the last transform will return a ``pandas.DataFrame`` (or IDataView/CSR; see above). diff --git a/src/python/docs/sphinx/conf.py b/src/python/docs/sphinx/conf.py index 1f0cccfc..9fb1d4ab 100644 --- a/src/python/docs/sphinx/conf.py +++ b/src/python/docs/sphinx/conf.py @@ -145,8 +145,8 @@ def install_and_import(package): 'relative': True, 'reference_url': { 'nimbusml': None, - 'matplotlib': 'http://matplotlib.org', - 'numpy': 'http://www.numpy.org/', + 'matplotlib': 'https://matplotlib.org', + 'numpy': 'https://www.numpy.org/', 'scipy': 'https://www.scipy.org/'}, } diff --git a/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst b/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst deleted file mode 100644 index 7aa663ee..00000000 --- a/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst +++ /dev/null @@ -1,4 +0,0 @@ -`nimbusml.feature_extraction.text.DssmFeaturizer` -=========================================================== - -.. 
autoclass:: nimbusml.feature_extraction.text.DssmFeaturizer diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index dc1a2c39..452dac23 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -93,8 +93,10 @@ - + + + @@ -249,6 +251,7 @@ + @@ -296,6 +299,7 @@ + @@ -352,7 +356,6 @@ - @@ -451,7 +454,6 @@ - @@ -474,6 +476,7 @@ + @@ -530,8 +533,6 @@ - - @@ -706,6 +707,7 @@ + @@ -716,10 +718,10 @@ + - @@ -819,14 +821,17 @@ + + + @@ -983,7 +988,6 @@ - diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 0fdadc02..afb13002 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.5.1' +__version__ = '1.6.1' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. diff --git a/src/python/nimbusml/base_transform.py b/src/python/nimbusml/base_transform.py index 393c3655..b227d567 100644 --- a/src/python/nimbusml/base_transform.py +++ b/src/python/nimbusml/base_transform.py @@ -35,7 +35,19 @@ def fit_transform(self, X, y=None, as_binary_data_stream=False, :param X: array-like with shape=[n_samples, n_features] or else :py:class:`nimbusml.FileDataStream` :param y: array-like with shape=[n_samples] - :return: pandas.DataFrame + :param as_binary_data_stream: If ``True`` then output an IDV file. + See `here `_ + for more information. + :param params: Additional arguments. + If ``as_csr=True`` and ``as_binary_data_stream=False`` then + return the transformed data in CSR (sparse matrix) format. + If ``as_binary_data_stream`` is also true then that + parameter takes precedence over ``as_csr`` and the output will + be an IDV file. + + :return: Returns a pandas DataFrame if no other output format + is specified. See ``as_binary_data_stream`` and ``as_csr`` + for other available output formats. """ pipeline = Pipeline([self]) try: @@ -88,8 +100,20 @@ def transform(self, X, as_binary_data_stream=False, **params): Applies transform to data. :param X: array-like with shape=[n_samples, n_features] or else - :py:class:`nimbusml.FileDataStream` - :return: pandas.DataFrame + :py:class:`nimbusml.FileDataStream` + :param as_binary_data_stream: If ``True`` then output an IDV file. + See `here `_ + for more information. + :param params: Additional arguments. + If ``as_csr=True`` and ``as_binary_data_stream=False`` then + return the transformed data in CSR (sparse matrix) format. + If ``as_binary_data_stream`` is also true then that + parameter takes precedence over ``as_csr`` and the output will + be an IDV file. + + :return: Returns a pandas DataFrame if no other output format + is specified. See ``as_binary_data_stream`` and ``as_csr`` + for other available output formats. """ # Check that the input is of the same shape as the one passed # during diff --git a/src/python/nimbusml/datasets/datasets.py b/src/python/nimbusml/datasets/datasets.py index 56c325a6..9f040ff1 100644 --- a/src/python/nimbusml/datasets/datasets.py +++ b/src/python/nimbusml/datasets/datasets.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- """ -Datasets used in MicrosoftML unittests. +Datasets used in MicrosoftML unittests. 
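The ``as_binary_data_stream``/``as_csr`` behavior documented in the ``base_transform.py`` docstrings above can be exercised like this (a sketch; the featurizer choice and column names are arbitrary):

```python
import pandas as pd
from nimbusml.feature_extraction.text import NGramFeaturizer

df = pd.DataFrame({'text': ['hello world', 'hello nimbusml']})
xf = NGramFeaturizer(columns={'ngrams': 'text'})

dense = xf.fit_transform(df)                # default: pandas.DataFrame
sparse = xf.fit_transform(df, as_csr=True)  # scipy CSR matrix instead
print(type(dense), type(sparse))
```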
""" import copy import os @@ -15,6 +15,8 @@ __all__ = ["get_dataset", "available_datasets"] +DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + class DataSet: """ @@ -175,11 +177,7 @@ def load(self): # isCase ~ age + parity + education + spontaneous + induced # education age parity induced case spontaneous stratum # pooled.stratum - this = os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "infert.csv") + this = os.path.join(DATA_DIR, "gplv2", "infert.csv") self.__dict__['_data'] = pandas.read_csv(this) self.__dict__['case'] = self._data["case"] self._finalize() @@ -229,11 +227,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "infert.csv") + return os.path.join(DATA_DIR, "gplv2", "infert.csv") class DataSetAirQuality(DataSet): @@ -262,11 +256,7 @@ def load(self): # isCase ~ age + parity + education + spontaneous + induced # education age parity induced case spontaneous stratum # pooled.stratum - this = os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "airquality.csv") + this = os.path.join(DATA_DIR, "gplv2", "airquality.csv") self.__dict__['_data'] = pandas.read_csv(this) self._finalize() @@ -294,11 +284,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "airquality.csv") + return os.path.join(DATA_DIR, "gplv2", "airquality.csv") class Topics(DataSet): @@ -324,8 +310,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join(os.path.dirname(__file__), "data", - "topics.csv") + return os.path.join(DATA_DIR, "topics.csv") class Timeseries(DataSet): @@ -351,10 +336,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "timeseries.csv") + return os.path.join(DATA_DIR, "timeseries.csv") class WikiDetox_Train(DataSet): @@ -379,10 +361,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-250.wikipedia.sample.tsv") + return os.path.join(DATA_DIR, "train-250.wikipedia.sample.tsv") class WikiDetox_Test(DataSet): @@ -407,10 +386,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test.wikipedia.sample.tsv") + return os.path.join(DATA_DIR, "test.wikipedia.sample.tsv") class FS_Train(DataSet): @@ -435,10 +411,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train_fs.csv") + return os.path.join(DATA_DIR, "train_fs.csv") class FS_Test(DataSet): @@ -463,10 +436,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test_fs.csv") + return os.path.join(DATA_DIR, "test_fs.csv") class MSLTR_Train(DataSet): @@ -492,10 +462,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-msltr.sample.csv") + return os.path.join(DATA_DIR, "train-msltr.sample.csv") class MSLTR_Test(DataSet): @@ -521,10 +488,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-msltr.sample.csv") + return os.path.join(DATA_DIR, "test-msltr.sample.csv") class Uci_Train(DataSet): @@ -548,10 +512,7 @@ def as_filepath(self): """ Return file name. 
""" - return os.path.join( - os.path.dirname(__file__), - "data", - "train-500.uciadult.sample.csv") + return os.path.join(DATA_DIR, "train-500.uciadult.sample.csv") class Uci_Test(DataSet): @@ -575,10 +536,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-100.uciadult.sample.csv") + return os.path.join(DATA_DIR, "test-100.uciadult.sample.csv") class Generated_Twitter_Train(DataSet): @@ -603,10 +561,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-twitter.gen-sample.tsv") + return os.path.join(DATA_DIR, "train-twitter.gen-sample.tsv") class Generated_Twitter_Test(DataSet): @@ -631,10 +586,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-twitter.gen-sample.tsv") + return os.path.join(DATA_DIR, "test-twitter.gen-sample.tsv") class Generated_Ticket_Train(DataSet): @@ -659,10 +611,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-ticketchoice.csv") + return os.path.join(DATA_DIR, "train-ticketchoice.csv") class Generated_Ticket_Test(DataSet): @@ -687,10 +636,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-ticketchoice.csv") + return os.path.join(DATA_DIR, "test-ticketchoice.csv") _datasets = dict( diff --git a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py index fd3d75a2..4a8f6c44 100644 --- a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py +++ b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py @@ -44,7 +44,7 @@ class FactorizationMachineBinaryClassifier( `Field Aware Factorization Machines `_, `Field-aware Factorization Machines for CTR Prediction - `_, + `_, `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_ diff --git a/src/python/nimbusml/decomposition/pcaanomalydetector.py b/src/python/nimbusml/decomposition/pcaanomalydetector.py index bdf42b22..85938224 100644 --- a/src/python/nimbusml/decomposition/pcaanomalydetector.py +++ b/src/python/nimbusml/decomposition/pcaanomalydetector.py @@ -57,13 +57,12 @@ class PcaAnomalyDetector(core, BasePredictor, ClassifierMixin): `Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices - `_ + `_ `A randomized algorithm for principal component analysis `_, `Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/ensemble/booster/dart.py b/src/python/nimbusml/ensemble/booster/dart.py index 33dc8295..ab6857e2 100644 --- a/src/python/nimbusml/ensemble/booster/dart.py +++ b/src/python/nimbusml/ensemble/booster/dart.py @@ -24,9 +24,9 @@ class Dart(core): `_ is an ensemble method of boosted regression trees. The Dropouts meet Multiple Additive Regression - Trees (DART) employs dropouts in MART and overcomes the issues of over- + Trees (DART) employs dropouts in MART and overcomes the issues of over- specialization of MART, - achiving better performance in many tasks. + achieving better performance in many tasks. 
**Reference** diff --git a/src/python/nimbusml/ensemble/booster/goss.py b/src/python/nimbusml/ensemble/booster/goss.py index 8e57181b..9b17e4ad 100644 --- a/src/python/nimbusml/ensemble/booster/goss.py +++ b/src/python/nimbusml/ensemble/booster/goss.py @@ -22,9 +22,9 @@ class Goss(core): .. remarks:: Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling named gradient-based - sampling. For datasets with large sample size, GOSS has considerable + sampling. For datasets with large sample size, GOSS has considerable advantage in terms of - statistical and computational efficiency. + statistical and computational efficiency. diff --git a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py index ea911977..5e6d5bd9 100644 --- a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py @@ -55,7 +55,7 @@ class FastForestBinaryClassifier( **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/nimbusml/ensemble/fastforestregressor.py b/src/python/nimbusml/ensemble/fastforestregressor.py index 5a2affe4..cb20c847 100644 --- a/src/python/nimbusml/ensemble/fastforestregressor.py +++ b/src/python/nimbusml/ensemble/fastforestregressor.py @@ -64,7 +64,7 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py index 8c12cb48..24f633fe 100644 --- a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py @@ -81,7 +81,7 @@ class FastTreesBinaryClassifier( `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/ensemble/fasttreesregressor.py b/src/python/nimbusml/ensemble/fasttreesregressor.py index c3994230..12c8c59b 100644 --- a/src/python/nimbusml/ensemble/fasttreesregressor.py +++ b/src/python/nimbusml/ensemble/fasttreesregressor.py @@ -83,7 +83,7 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py index 1db266b7..177d9ede 100644 --- a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py +++ b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py @@ -38,7 +38,7 @@ class FastTreesTweedieRegressor( `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/ensemble/gambinaryclassifier.py b/src/python/nimbusml/ensemble/gambinaryclassifier.py index eb08e95c..79808610 100644 --- a/src/python/nimbusml/ensemble/gambinaryclassifier.py +++ b/src/python/nimbusml/ensemble/gambinaryclassifier.py @@ -42,7 +42,7 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): functions learned will step between the discretization boundaries. 
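Since the ``Dart`` and ``Goss`` docstrings above describe boosters rather than standalone trainers, here is a hedged sketch of plugging one into a LightGBM trainer (toy data; all booster parameters left at their defaults):

```python
import pandas as pd
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.ensemble.booster import Dart

X = pd.DataFrame({'x1': [0.1, 0.9, 0.2, 0.8] * 10,
                  'x2': [1.0, 0.0, 1.0, 0.0] * 10})
y = pd.Series([0, 1, 0, 1] * 10, name='y')

# Use DART (dropout-regularized MART) as the boosting scheme.
model = LightGbmBinaryClassifier(booster=Dart())
model.fit(X, y)
print(model.predict(X).head())
```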
This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -78,7 +78,7 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/ensemble/gamregressor.py b/src/python/nimbusml/ensemble/gamregressor.py index c57ad499..45796805 100644 --- a/src/python/nimbusml/ensemble/gamregressor.py +++ b/src/python/nimbusml/ensemble/gamregressor.py @@ -41,7 +41,7 @@ class GamRegressor(core, BasePredictor, RegressorMixin): functions learned will step between the discretization boundaries. This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -77,7 +77,7 @@ class GamRegressor(core, BasePredictor, RegressorMixin): `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py b/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py index 69566dab..1e0bd727 100644 --- a/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py +++ b/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py @@ -20,7 +20,6 @@ feature=['age', 'parity', 'spontaneous'], label='case')]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py b/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py index 508f8a84..52dbcc6f 100644 --- a/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py +++ b/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py @@ -26,7 +26,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastForestBinaryClassifier.py b/src/python/nimbusml/examples/FastForestBinaryClassifier.py index aa7f34ed..1f1a5e3f 100644 --- a/src/python/nimbusml/examples/FastForestBinaryClassifier.py +++ b/src/python/nimbusml/examples/FastForestBinaryClassifier.py @@ -25,7 +25,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastLinearBinaryClassifier.py b/src/python/nimbusml/examples/FastLinearBinaryClassifier.py index fd38072a..73f72f03 100644 --- a/src/python/nimbusml/examples/FastLinearBinaryClassifier.py +++ b/src/python/nimbusml/examples/FastLinearBinaryClassifier.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastLinearClassifier.py b/src/python/nimbusml/examples/FastLinearClassifier.py index d668a49e..32d00ecd 100644 --- a/src/python/nimbusml/examples/FastLinearClassifier.py +++ b/src/python/nimbusml/examples/FastLinearClassifier.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, 
predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastLinearRegressor.py b/src/python/nimbusml/examples/FastLinearRegressor.py index 4fb64001..64b97cc4 100644 --- a/src/python/nimbusml/examples/FastLinearRegressor.py +++ b/src/python/nimbusml/examples/FastLinearRegressor.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastTreesBinaryClassifier.py b/src/python/nimbusml/examples/FastTreesBinaryClassifier.py index 4d9712e1..6a3d1458 100644 --- a/src/python/nimbusml/examples/FastTreesBinaryClassifier.py +++ b/src/python/nimbusml/examples/FastTreesBinaryClassifier.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastTreesRegressor.py b/src/python/nimbusml/examples/FastTreesRegressor.py index aac8fc38..a08ac653 100644 --- a/src/python/nimbusml/examples/FastTreesRegressor.py +++ b/src/python/nimbusml/examples/FastTreesRegressor.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/FastTreesTweedieRegressor.py b/src/python/nimbusml/examples/FastTreesTweedieRegressor.py index f6a0bac1..008107ac 100644 --- a/src/python/nimbusml/examples/FastTreesTweedieRegressor.py +++ b/src/python/nimbusml/examples/FastTreesTweedieRegressor.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/GamBinaryClassifier.py b/src/python/nimbusml/examples/GamBinaryClassifier.py index 78ee1ba4..de8d049f 100644 --- a/src/python/nimbusml/examples/GamBinaryClassifier.py +++ b/src/python/nimbusml/examples/GamBinaryClassifier.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/GamRegressor.py b/src/python/nimbusml/examples/GamRegressor.py index c4bf43f8..82a3b70b 100644 --- a/src/python/nimbusml/examples/GamRegressor.py +++ b/src/python/nimbusml/examples/GamRegressor.py @@ -23,7 +23,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/Image.py b/src/python/nimbusml/examples/Image.py index 08c6aa35..78ee62b6 100644 --- a/src/python/nimbusml/examples/Image.py +++ b/src/python/nimbusml/examples/Image.py @@ -18,10 +18,10 @@ X = data[['Path']] y = data[['Label']] -# define the training pipeline +# define the training pipeline pipeline = Pipeline([ Loader(columns={'ImgPath': 'Path'}), - Resizer(image_width=227, image_height=227, + Resizer(image_width=32, image_height=32, columns={'ImgResize': 'ImgPath'}), PixelExtractor(columns={'ImgPixels': 'ImgResize'}), FastLinearBinaryClassifier(feature='ImgPixels') diff --git a/src/python/nimbusml/examples/KMeansPlusPlus.py b/src/python/nimbusml/examples/KMeansPlusPlus.py index fab4c2d8..673feb95 100644 --- a/src/python/nimbusml/examples/KMeansPlusPlus.py +++ 
b/src/python/nimbusml/examples/KMeansPlusPlus.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline \ .fit(data) \ .test(data, 'induced', output_scores=True) diff --git a/src/python/nimbusml/examples/LightGbmBinaryClassifier.py b/src/python/nimbusml/examples/LightGbmBinaryClassifier.py index 3774c815..b4a99dda 100644 --- a/src/python/nimbusml/examples/LightGbmBinaryClassifier.py +++ b/src/python/nimbusml/examples/LightGbmBinaryClassifier.py @@ -26,7 +26,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit( data, 'case').test( data, output_scores=True) diff --git a/src/python/nimbusml/examples/LightGbmClassifier.py b/src/python/nimbusml/examples/LightGbmClassifier.py index 15179a3b..543f72ca 100644 --- a/src/python/nimbusml/examples/LightGbmClassifier.py +++ b/src/python/nimbusml/examples/LightGbmClassifier.py @@ -26,7 +26,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/LightGbmRanker.py b/src/python/nimbusml/examples/LightGbmRanker.py index b137ff94..7b04a87d 100644 --- a/src/python/nimbusml/examples/LightGbmRanker.py +++ b/src/python/nimbusml/examples/LightGbmRanker.py @@ -16,7 +16,6 @@ feature=['Class', 'dep_day', 'duration'], label='rank', group_id='group')]) # train, predict, and evaluate. -# TODO: Replace with CV metrics, predictions = pipeline \ .fit(data) \ .test(data, output_scores=True) diff --git a/src/python/nimbusml/examples/LightGbmRegressor.py b/src/python/nimbusml/examples/LightGbmRegressor.py index 6165f614..cac8a047 100644 --- a/src/python/nimbusml/examples/LightGbmRegressor.py +++ b/src/python/nimbusml/examples/LightGbmRegressor.py @@ -26,7 +26,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py index 1a2d70e6..50d760ec 100644 --- a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py +++ b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py @@ -20,7 +20,6 @@ feature=['age', 'parity', 'spontaneous'], label='case')]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py b/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py index a99b5dc3..e9b15be8 100644 --- a/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py +++ b/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/LogisticRegressionClassifier.py b/src/python/nimbusml/examples/LogisticRegressionClassifier.py index 232605c8..80af4ee0 100644 --- a/src/python/nimbusml/examples/LogisticRegressionClassifier.py +++ b/src/python/nimbusml/examples/LogisticRegressionClassifier.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/NGramExtractor.py 
b/src/python/nimbusml/examples/NGramExtractor.py new file mode 100644 index 00000000..facb4596 --- /dev/null +++ b/src/python/nimbusml/examples/NGramExtractor.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +############################################################################### +# NGramExtractor +from nimbusml import FileDataStream, Pipeline +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.schema import ColumnDropper +from nimbusml.preprocessing.text import CharTokenizer +from nimbusml.feature_extraction.text import NGramExtractor + +# data input (as a FileDataStream) +path = get_dataset("wiki_detox_train").as_filepath() + +data = FileDataStream.read_csv(path, sep='\t') +print(data.head()) +# Sentiment SentimentText +# 0 1 ==RUDE== Dude, you are rude upload that carl p... +# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK... +# 2 1 Stop trolling, zapatancas, calling me a liar m... +# 3 1 ==You're cool== You seem like a really cool g... +# 4 1 ::::: Why are you threatening me? I'm not bein... + +# transform usage +pipe = Pipeline([ + CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), + NGramExtractor(ngram_length=1, all_lengths=False, columns={'Ngrams': 'SentimentText_Transform'}), + ColumnDropper(columns=['SentimentText_Transform', 'SentimentText', 'Sentiment']) + ]) + +# fit and transform +features = pipe.fit_transform(data) + +print(features.head()) +# Ngrams.<␂> Ngrams.= Ngrams.R Ngrams.U Ngrams.D Ngrams.E ... +# 0 1.0 4.0 1.0 1.0 2.0 1.0 ... +# 1 1.0 4.0 0.0 0.0 2.0 3.0 ... +# 2 1.0 0.0 0.0 0.0 0.0 0.0 ... +# 3 1.0 4.0 0.0 0.0 0.0 0.0 ... +# 4 1.0 0.0 0.0 0.0 0.0 0.0 ... diff --git a/src/python/nimbusml/examples/NaiveBayesClassifier.py b/src/python/nimbusml/examples/NaiveBayesClassifier.py index 04e038af..8cabd122 100644 --- a/src/python/nimbusml/examples/NaiveBayesClassifier.py +++ b/src/python/nimbusml/examples/NaiveBayesClassifier.py @@ -25,7 +25,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/OneVsRestClassifier.py b/src/python/nimbusml/examples/OneVsRestClassifier.py index e5c864cb..caef3cc6 100644 --- a/src/python/nimbusml/examples/OneVsRestClassifier.py +++ b/src/python/nimbusml/examples/OneVsRestClassifier.py @@ -30,7 +30,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py b/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py index 85f6e49f..95a6f18c 100644 --- a/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py +++ b/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py b/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py index c394f23b..8a9feebc 100644 --- a/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py +++ b/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/PcaAnomalyDetector.py 
b/src/python/nimbusml/examples/PcaAnomalyDetector.py index dfe50237..8e16aa91 100644 --- a/src/python/nimbusml/examples/PcaAnomalyDetector.py +++ b/src/python/nimbusml/examples/PcaAnomalyDetector.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test( data, 'case', output_scores=True) # Score diff --git a/src/python/nimbusml/examples/PoissonRegressionRegressor.py b/src/python/nimbusml/examples/PoissonRegressionRegressor.py index 5edd5d27..0e2a3653 100644 --- a/src/python/nimbusml/examples/PoissonRegressionRegressor.py +++ b/src/python/nimbusml/examples/PoissonRegressionRegressor.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py index c0b8d493..c54e708d 100644 --- a/src/python/nimbusml/examples/Schema.py +++ b/src/python/nimbusml/examples/Schema.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- ############################################################################### # Get schema from a fitted pipeline example. import numpy as np @@ -30,4 +31,4 @@ schema = pipe.get_output_columns() print(schema[0:5]) -# ['Sentiment', 'SentimentText', 'features.Char.|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u'] +# ['Sentiment', 'SentimentText', 'features.Char.<␂>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u'] diff --git a/src/python/nimbusml/examples/SgdBinaryClassifier.py b/src/python/nimbusml/examples/SgdBinaryClassifier.py index df6c7c6a..a31576f0 100644 --- a/src/python/nimbusml/examples/SgdBinaryClassifier.py +++ b/src/python/nimbusml/examples/SgdBinaryClassifier.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/SymSgdBinaryClassifier.py b/src/python/nimbusml/examples/SymSgdBinaryClassifier.py index 9cae2d8f..0d5c09a5 100644 --- a/src/python/nimbusml/examples/SymSgdBinaryClassifier.py +++ b/src/python/nimbusml/examples/SymSgdBinaryClassifier.py @@ -24,7 +24,6 @@ ]) # train, predict, and evaluate -# TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions diff --git a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py index 8e33ab7b..f049c39a 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py @@ -1,7 +1,9 @@ ############################################################################### # DateTimeSplitter import pandas +from nimbusml import Pipeline from nimbusml.preprocessing import DateTimeSplitter +from nimbusml.preprocessing.schema import ColumnSelector df = pandas.DataFrame(data=dict( tokens1=[1, 2, 3, 157161600], @@ -9,16 +11,16 @@ )) cols_to_drop = [ - 'Hour12', 'DayOfWeek', 'DayOfQuarter', - 'DayOfYear', 'WeekOfMonth', 'QuarterOfYear', - 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel', - 'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff' + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' ] -cd = 
DateTimeSplitter(prefix='dt', - country='Canada', - columns_to_drop=cols_to_drop) << 'tokens1' -y = cd.fit_transform(df) +dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1' + +pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) +y = pipeline.fit_transform(df) # view the three columns pandas.set_option('display.max_columns', None) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py b/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py index 8dd050a0..3cdbb00e 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py @@ -18,7 +18,7 @@ # transforms and learners transform_1 = Loader() << 'Path' -transform_2 = Resizer(image_width=227, image_height=227) +transform_2 = Resizer(image_width=32, image_height=32) transform_3 = PixelExtractor() algo = FastLinearBinaryClassifier() << 'Path' diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py new file mode 100644 index 00000000..ddc27ab3 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +############################################################################### +# Example with NGramExtractor and LogisticRegressionBinaryClassifier +import pandas +from nimbusml import Pipeline +from nimbusml.feature_extraction.text import NGramExtractor +from nimbusml.linear_model import LogisticRegressionBinaryClassifier +from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper +from nimbusml.preprocessing.text import CharTokenizer + +train_reviews = pandas.DataFrame( + data=dict( + review=[ + "This is great", + "I hate it", + "Love it", + "Do not like it", + "Really like it", + "I hate it", + "I like it a lot", + "I kind of hate it", + "I do like it", + "I really hate it", + "It is very good", + "I hate it a bunch", + "I love it a bunch", + "I hate it", + "I like it very much", + "I hate it very much.", + "I really do love it", + "I really do hate it", + "Love it!", + "Hate it!", + "I love it", + "I hate it", + "I love it", + "I hate it", + "I love it"], + like=[ + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True, + False, + True])) + +test_reviews = pandas.DataFrame( + data=dict( + review=[ + "This is great", + "I hate it", + "Love it", + "Really like it", + "I hate it", + "I like it a lot", + "I love it", + "I do like it", + "I really hate it", + "I love it"])) + +y = train_reviews['like'] +X = train_reviews.loc[:, train_reviews.columns != 'like'] + +pipeline = Pipeline([ + CharTokenizer(columns={'review_transform': 'review'}), + NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}), + ColumnDropper(columns=['review_transform', 'review']) +]) +X = pipeline.fit_transform(X) + +print(X.head()) +# ngrams.<␂>|T|h ngrams.T|h|i ngrams.h|i|s ngrams.i|s|<␠> ... ngrams.i|t|! ngrams.t|!|<␃> ngrams.<␂>|H|a ngrams.H|a|t +# 0 1.0 1.0 1.0 2.0 ... 0.0 0.0 0.0 0.0 +# 1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 +# 2 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 +# 3 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 +# 4 0.0 0.0 0.0 0.0 ... 
0.0 0.0 0.0 0.0 + +model = LogisticRegressionBinaryClassifier().fit(X, y) + +X_test = pipeline.transform(test_reviews) +result = model.predict(X_test) + +print(result) +# 0 True +# 1 False +# 2 True +# 3 True +# 4 False +# 5 True +# 6 True +# 7 True +# 8 False +# 9 True diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py index 9a4eba53..074ce92f 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py @@ -2,8 +2,7 @@ # WordEmbedding: pre-trained transform to generate word embeddings import pandas from nimbusml import Pipeline -from nimbusml.feature_extraction.text import WordEmbedding -from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer +from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding from nimbusml.feature_extraction.text.extractor import Ngram # create the data @@ -22,5 +21,12 @@ ]) y = pipeline.fit_transform(customer_reviews) -# view the review embeddings -# print(y.head()) +# view a small subset of the review embeddings +print(y.iloc[:5, -3:]) +# review_TransformedText.147 review_TransformedText.148 review_TransformedText.149 +# 0 1.918661 -0.714531 3.062141 +# 1 1.891922 -0.248650 1.706620 +# 2 1.601611 0.309785 3.379576 +# 3 1.970666 1.477450 3.110802 +# 4 2.521791 0.122538 3.129919 + diff --git a/src/python/nimbusml/feature_extraction/image/loader.py b/src/python/nimbusml/feature_extraction/image/loader.py index bd93a080..0ee0e305 100644 --- a/src/python/nimbusml/feature_extraction/image/loader.py +++ b/src/python/nimbusml/feature_extraction/image/loader.py @@ -20,7 +20,7 @@ class Loader(core, BaseTransform, TransformerMixin): """ - Loaders image data. + Loads image data. .. remarks:: ``Loader`` loads images from paths. diff --git a/src/python/nimbusml/feature_extraction/image/resizer.py b/src/python/nimbusml/feature_extraction/image/resizer.py index 77d9434f..2a8baf4a 100644 --- a/src/python/nimbusml/feature_extraction/image/resizer.py +++ b/src/python/nimbusml/feature_extraction/image/resizer.py @@ -20,16 +20,16 @@ class Resizer(core, BaseTransform, TransformerMixin): """ - Resizers an image to a specified dimension using a specified + Resizes an image to a specified dimension using a specified resizing method. .. remarks:: - ``Resizer`` resizers an image to the specified height and width + ``Resizer`` resizes an image to the specified height and width using a specified resizing method. The input variables to this transforms must be images, typically the result of the ``Loader`` transform. - :param columns: a dictionary of key-value pairs, where key is the output + :param columns: A dictionary of key-value pairs, where key is the output column name and value is the input column name. * Multiple key-value pairs are allowed. 
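For reference, a minimal sketch of the image pipeline these Loader/Resizer docstring fixes describe, condensed from the updated Image.py example above (the 32x32 size matches the new example settings; the column names are the ones used there):

from nimbusml import Pipeline
from nimbusml.feature_extraction.image import Loader, PixelExtractor, Resizer
from nimbusml.linear_model import FastLinearBinaryClassifier

pipeline = Pipeline([
    Loader(columns={'ImgPath': 'Path'}),                 # read images from file paths
    Resizer(image_width=32, image_height=32,             # resize to the new, smaller example dimensions
            columns={'ImgResize': 'ImgPath'}),
    PixelExtractor(columns={'ImgPixels': 'ImgResize'}),  # flatten pixels into a numeric feature vector
    FastLinearBinaryClassifier(feature='ImgPixels')
])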
diff --git a/src/python/nimbusml/feature_extraction/text/__init__.py b/src/python/nimbusml/feature_extraction/text/__init__.py index 7dbd24cf..9c16726e 100644 --- a/src/python/nimbusml/feature_extraction/text/__init__.py +++ b/src/python/nimbusml/feature_extraction/text/__init__.py @@ -1,10 +1,12 @@ from .lightlda import LightLda +from .ngramextractor import NGramExtractor from .ngramfeaturizer import NGramFeaturizer from .sentiment import Sentiment from .wordembedding import WordEmbedding __all__ = [ 'LightLda', + 'NGramExtractor', 'NGramFeaturizer', 'Sentiment', 'WordEmbedding' diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py index 9ec1858f..6da8cfd2 100644 --- a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py +++ b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py @@ -18,7 +18,7 @@ class Ngram(core): """ - Extracts NGrams from text and convert them to vector using + Extracts NGrams from text and converts them to vector using dictionary. .. remarks:: diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py index 2f373a31..fca66615 100644 --- a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py +++ b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py @@ -18,7 +18,7 @@ class NgramHash(core): """ - Extracts NGrams from text and convert them to vector using hashing + Extracts NGrams from text and converts them to vector using hashing trick. .. remarks:: diff --git a/src/python/nimbusml/feature_extraction/text/lightlda.py b/src/python/nimbusml/feature_extraction/text/lightlda.py index 271f90c7..f8801caa 100644 --- a/src/python/nimbusml/feature_extraction/text/lightlda.py +++ b/src/python/nimbusml/feature_extraction/text/lightlda.py @@ -30,7 +30,7 @@ class LightLda(core, BaseTransform, TransformerMixin): topical vectors. LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of optimization techniques - `(http://arxiv.org/abs/1412.1576) `_. + `(https://arxiv.org/abs/1412.1576) `_. With the LDA transform, we can train a topic model to produce 1 million topics with 1 million vocabulary on a 1-billion-token document set one diff --git a/src/python/nimbusml/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/feature_extraction/text/ngramextractor.py new file mode 100644 index 00000000..f27b7004 --- /dev/null +++ b/src/python/nimbusml/feature_extraction/text/ngramextractor.py @@ -0,0 +1,72 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +NGramExtractor +""" + +__all__ = ["NGramExtractor"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.feature_extraction.text.ngramextractor import \ + NGramExtractor as core +from ...internal.utils.utils import trace + + +class NGramExtractor(core, BaseTransform, TransformerMixin): + """ + **Description** + Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. 
It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag. + + :param columns: see `Columns `_. + + :param ngram_length: Maximum n-gram length. + + :param all_lengths: Whether to store all n-gram lengths up to ngramLength, + or only ngramLength. + + :param skip_length: Maximum number of tokens to skip when constructing an + n-gram. + + :param max_num_terms: Maximum number of n-grams to store in the dictionary. + + :param weighting: The weighting criteria. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + ngram_length=2, + all_lengths=True, + skip_length=0, + max_num_terms=[10000000], + weighting='Tf', + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + ngram_length=ngram_length, + all_lengths=all_lengths, + skip_length=skip_length, + max_num_terms=max_num_terms, + weighting=weighting, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/feature_extraction/text/wordembedding.py b/src/python/nimbusml/feature_extraction/text/wordembedding.py index ad467ce1..957cf06d 100644 --- a/src/python/nimbusml/feature_extraction/text/wordembedding.py +++ b/src/python/nimbusml/feature_extraction/text/wordembedding.py @@ -31,7 +31,7 @@ class WordEmbedding(core, BaseTransform, TransformerMixin): available options are various versions of `GloVe Models `_, `FastText `_, and `Sswe - `_. + `_. :param columns: a dictionary of key-value pairs, where key is the output diff --git a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py index c54f353b..bdc0a7d2 100644 --- a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py +++ b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py @@ -42,7 +42,7 @@ class FactorizationMachineBinaryClassifier( `Field Aware Factorization Machines `_, `Field-aware Factorization Machines for CTR Prediction - `_, + `_, `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_ diff --git a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py index 728a7132..9fe01d4f 100644 --- a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py +++ b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py @@ -57,13 +57,12 @@ class PcaAnomalyDetector( `Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices - `_ + `_ `A randomized algorithm for principal component analysis `_, `Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions - `_ + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/nimbusml/internal/core/ensemble/booster/dart.py b/src/python/nimbusml/internal/core/ensemble/booster/dart.py index dd4418d3..49297929 100644 --- a/src/python/nimbusml/internal/core/ensemble/booster/dart.py +++ b/src/python/nimbusml/internal/core/ensemble/booster/dart.py @@ -25,9 +25,9 @@ class Dart(Component): `_ is an ensemble method of boosted regression trees. 
The Dropouts meet Multiple Additive Regression - Trees (DART) employs dropouts in MART and overcomes the issues of over- + Trees (DART) employs dropouts in MART and overcomes the issues of over- specialization of MART, - achiving better performance in many tasks. + achieving better performance in many tasks. **Reference** diff --git a/src/python/nimbusml/internal/core/ensemble/booster/goss.py b/src/python/nimbusml/internal/core/ensemble/booster/goss.py index 694cb8bf..aa552afc 100644 --- a/src/python/nimbusml/internal/core/ensemble/booster/goss.py +++ b/src/python/nimbusml/internal/core/ensemble/booster/goss.py @@ -23,9 +23,9 @@ class Goss(Component): .. remarks:: Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling named gradient-based - sampling. For datasets with large sample size, GOSS has considerable + sampling. For datasets with large sample size, GOSS has considerable advantage in terms of - statistical and computational efficiency. + statistical and computational efficiency. diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py index 270584a3..715c2035 100644 --- a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py @@ -54,7 +54,7 @@ class FastForestBinaryClassifier( **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py index 74698a6d..37278659 100644 --- a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py @@ -64,7 +64,7 @@ class FastForestRegressor( **Reference** `Wikipedia: Random forest - `_ + `_ `Quantile regression forest `_ diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py index 37e5cd76..eef52d67 100644 --- a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py @@ -78,7 +78,7 @@ class FastTreesBinaryClassifier( `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param number_of_trees: Specifies the total number of decision trees to create in the ensemble. By creating more decision trees, you can diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py index 3ee724c4..25becac7 100644 --- a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py @@ -83,7 +83,7 @@ class FastTreesRegressor( `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param number_of_trees: Specifies the total number of decision trees to create in the ensemble. 
By creating more decision trees, you can diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py index f9340f5d..75a15169 100644 --- a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py @@ -35,7 +35,7 @@ class FastTreesTweedieRegressor( `_ `Greedy function approximation: A gradient boosting machine. - `_ + `_ :param number_of_trees: Specifies the total number of decision trees to create in the ensemble. By creating more decision trees, you can diff --git a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py index 56d90d7e..52f2f565 100644 --- a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py @@ -42,7 +42,7 @@ class GamBinaryClassifier( functions learned will step between the discretization boundaries. This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -78,7 +78,7 @@ class GamBinaryClassifier( `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param number_of_iterations: Total number of iterations over all features. diff --git a/src/python/nimbusml/internal/core/ensemble/gamregressor.py b/src/python/nimbusml/internal/core/ensemble/gamregressor.py index 048bf874..de884d9a 100644 --- a/src/python/nimbusml/internal/core/ensemble/gamregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/gamregressor.py @@ -40,7 +40,7 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): functions learned will step between the discretization boundaries. This implementation is based on the this `paper - `_, + `_, but diverges from it in several important respects: most significantly, in each round of boosting, rather than do one feature at a time, it @@ -76,7 +76,7 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): `Generalized additive models `_, `Intelligible Models for Classification and Regression - `_ + `_ :param number_of_iterations: Total number of iterations over all features. diff --git a/src/python/nimbusml/internal/core/feature_extraction/image/loader.py b/src/python/nimbusml/internal/core/feature_extraction/image/loader.py index ad8c70c1..888afab4 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/image/loader.py +++ b/src/python/nimbusml/internal/core/feature_extraction/image/loader.py @@ -18,7 +18,7 @@ class Loader(BasePipelineItem, DefaultSignature): """ - Loaders image data. + Loads image data. .. remarks:: ``Loader`` loads images from paths. diff --git a/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py b/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py index 34ba1f39..819fb51c 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py +++ b/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py @@ -18,11 +18,11 @@ class Resizer(BasePipelineItem, DefaultSignature): """ - Resizers an image to a specified dimension using a specified + Resizes an image to a specified dimension using a specified resizing method. .. 
remarks:: - ``Resizer`` resizers an image to the specified height and width + ``Resizer`` resizes an image to the specified height and width using a specified resizing method. The input variables to this transforms must be images, typically the result of the ``Loader`` transform. diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py index 07fde941..a7292f9c 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py @@ -18,7 +18,7 @@ class Ngram(Component): """ - Extracts NGrams from text and convert them to vector using + Extracts NGrams from text and converts them to vector using dictionary. .. remarks:: diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py index cd08b4be..04cb7713 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py @@ -18,7 +18,7 @@ class NgramHash(Component): """ - Extracts NGrams from text and convert them to vector using hashing + Extracts NGrams from text and converts them to vector using hashing trick. .. remarks:: diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py index 45743c1b..8fbcc6e5 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py @@ -28,7 +28,7 @@ class LightLda(BasePipelineItem, DefaultSignature): topical vectors. LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of optimization techniques - `(http://arxiv.org/abs/1412.1576) `_. + `(https://arxiv.org/abs/1412.1576) `_. With the LDA transform, we can train a topic model to produce 1 million topics with 1 million vocabulary on a 1-billion-token document set one diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py new file mode 100644 index 00000000..c627addd --- /dev/null +++ b/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py @@ -0,0 +1,111 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +NGramExtractor +""" + +__all__ = ["NGramExtractor"] + + +from ....entrypoints.transforms_ngramtranslator import \ + transforms_ngramtranslator +from ....utils.utils import trace +from ...base_pipeline_item import BasePipelineItem, DefaultSignature + + +class NGramExtractor(BasePipelineItem, DefaultSignature): + """ + **Description** + Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag. + + :param ngram_length: Maximum n-gram length. 
+ + :param all_lengths: Whether to store all n-gram lengths up to ngramLength, + or only ngramLength. + + :param skip_length: Maximum number of tokens to skip when constructing an + n-gram. + + :param max_num_terms: Maximum number of n-grams to store in the dictionary. + + :param weighting: The weighting criteria. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + ngram_length=2, + all_lengths=True, + skip_length=0, + max_num_terms=[10000000], + weighting='Tf', + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.ngram_length = ngram_length + self.all_lengths = all_lengths + self.skip_length = skip_length + self.max_num_terms = max_num_terms + self.weighting = weighting + + @property + def _entrypoint(self): + return transforms_ngramtranslator + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + ngram_length=self.ngram_length, + all_lengths=self.all_lengths, + skip_length=self.skip_length, + max_num_terms=self.max_num_terms, + weighting=self.weighting) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py index d67df9db..45553249 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py @@ -28,7 +28,7 @@ class WordEmbedding(BasePipelineItem, DefaultSignature): available options are various versions of `GloVe Models `_, `FastText `_, and `Sswe - `_. + `_. :param model_kind: Pre-trained model used to create the vocabulary. 
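To make the new transform concrete, here is a minimal usage sketch of NGramExtractor, condensed from the NGramExtractor.py example added in this patch (CharTokenizer produces the key vector the extractor consumes; the columns dict maps output name to input name, matching the Name/Source pairing built in _get_node):

from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramExtractor
from nimbusml.preprocessing.text import CharTokenizer

pipe = Pipeline([
    CharTokenizer(columns={'chars': 'SentimentText'}),  # tokenize text into a vector of character keys
    NGramExtractor(ngram_length=2, all_lengths=True,    # count all 1- and 2-grams of those keys
                   columns={'ngrams': 'chars'})
])
# features = pipe.fit_transform(data)  # one count column per n-gram in the learned dictionary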
diff --git a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py index 26471467..67f10cfc 100644 --- a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py @@ -67,10 +67,10 @@ class AveragedPerceptronBinaryClassifier( `_ `Large Margin Classification Using the Perceptron Algorithm - `_ + `_ `Discriminative Training Methods for Hidden Markov Models - `_ + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py index 10c5c2a5..f8346814 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py @@ -23,7 +23,7 @@ class FastLinearBinaryClassifier( """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear binary classification. .. remarks:: ``FastLinearBinaryClassifier`` is a trainer based on the Stochastic @@ -80,8 +80,7 @@ class FastLinearBinaryClassifier( content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param l2_regularization: L2 regularizer constant. By default the l2 diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py index a2880b79..4afcba87 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py @@ -22,7 +22,8 @@ class FastLinearClassifier( DefaultSignatureWithRoles): """ - Train an SDCA multi class model + A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for + multi class classification. .. remarks:: ``FastLinearClassifier`` is a trainer based on the Stochastic Dual @@ -78,8 +79,7 @@ class FastLinearClassifier( content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param l2_regularization: L2 regularizer constant. By default the l2 diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py index cf9073e5..597e3dfb 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py @@ -23,7 +23,7 @@ class FastLinearRegressor( """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear regression. .. remarks:: ``FastLinearRegressor`` is a trainer based on the Stochastic Dual @@ -78,8 +78,7 @@ class FastLinearRegressor( content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param l2_regularization: L2 regularizer constant. 
By default the l2 diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py index 098c92e9..50b344ac 100644 --- a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py @@ -90,14 +90,14 @@ class LogisticRegressionBinaryClassifier( **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py index 90af2ffb..3fd6efba 100644 --- a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py @@ -91,14 +91,14 @@ class LogisticRegressionClassifier( **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py index b0c5e898..aada6337 100644 --- a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py @@ -35,14 +35,14 @@ class SgdBinaryClassifier( associated optimization problem is sparse, then Hogwild SGD achieves a nearly optimal rate of convergence. For a detailed reference, please - refer to `http://arxiv.org/pdf/1106.5730v2.pdf - `_. + refer to `https://arxiv.org/pdf/1106.5730v2.pdf + `_. **Reference** - `http://arxiv.org/pdf/1106.5730v2.pdf - `_ + `https://arxiv.org/pdf/1106.5730v2.pdf + `_ :param normalize: Specifies the type of automatic normalization used: diff --git a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py index db2c39ef..a00c3dc6 100644 --- a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py +++ b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py @@ -23,8 +23,6 @@ class DateTimeSplitter(BasePipelineItem, DefaultSignature): :param prefix: Output column prefix. - :param columns_to_drop: Columns to drop after the DateTime Expansion. - :param country: Country to get holidays for. Defaults to none if not passed. 
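With columns_to_drop removed from DateTimeSplitter, the replacement pattern (used by the updated DateTimeSplitter_df.py earlier in this patch) is to drop unwanted expanded columns with a ColumnSelector in the same pipeline. A minimal sketch, with an illustrative drop list taken from that example:

from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
pipeline = Pipeline([
    dts,                                                       # expand the timestamp into dt* columns
    ColumnSelector(drop_columns=['dtHour12', 'dtMonthLabel'])  # then drop expansions you do not need
])
# y = pipeline.fit_transform(df)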
@@ -36,14 +34,12 @@ class DateTimeSplitter(BasePipelineItem, DefaultSignature): def __init__( self, prefix, - columns_to_drop=None, country='None', **params): BasePipelineItem.__init__( self, type='transform', **params) self.prefix = prefix - self.columns_to_drop = columns_to_drop self.country = country @property @@ -55,7 +51,6 @@ def _get_node(self, **all_args): algo_args = dict( source=self.source, prefix=self.prefix, - columns_to_drop=self.columns_to_drop, country=self.country) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/fromkey.py b/src/python/nimbusml/internal/core/preprocessing/fromkey.py index bd5cfe10..ef7f8efb 100644 --- a/src/python/nimbusml/internal/core/preprocessing/fromkey.py +++ b/src/python/nimbusml/internal/core/preprocessing/fromkey.py @@ -19,8 +19,7 @@ class FromKey(BasePipelineItem, DefaultSignature): """ - Text transforms that can be performed on data before training - a model. + Converts the key types back to their original values. .. remarks:: The ``FromKey`` transform converts a column of keys, generated using diff --git a/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py b/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py index 55f0ed01..3fd199aa 100644 --- a/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py +++ b/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py @@ -33,14 +33,13 @@ class Handler(BasePipelineItem, DefaultSignature): were imputed. This works for columns that have numeric type. :param replace_with: The method to use to replace NaN values. The - following choices are available. - - * Def: Replace with default value of that type, usually ``0``. If no - replace - method is specified, this is the default strategy. - * Mean: Replace NaN values with the mean of the values in that column. - * Min: Replace with minimum value in the column. - * Max: Replace with maximum value in the column. + following choices are available. + + * Def: Replace with default value of that type, usually ``0``. If no + replace method is specified, this is the default strategy. + * Mean: Replace NaN values with the mean of the values in that column. + * Min: Replace with minimum value in the column. + * Max: Replace with maximum value in the column. :param impute_by_slot: Whether to impute values by slot. diff --git a/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py b/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py index d202e947..003e909f 100644 --- a/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py +++ b/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand """ PrefixColumnConcatenator """ @@ -15,10 +16,12 @@ from ...base_pipeline_item import BasePipelineItem, DefaultSignature -class PrefixColumnConcatenator(BasePipelineItem, DefaultSignature): +class PrefixColumnConcatenator( + BasePipelineItem, + DefaultSignature): """ - Combines several columns into a single vector-valued column by prefix + Combines several columns into a single vector-valued column by prefix. .. 
remarks:: ``PrefixColumnConcatenator`` creates a single vector-valued column from diff --git a/src/python/nimbusml/internal/core/preprocessing/tokey.py b/src/python/nimbusml/internal/core/preprocessing/tokey.py index 55cd7200..b1295adf 100644 --- a/src/python/nimbusml/internal/core/preprocessing/tokey.py +++ b/src/python/nimbusml/internal/core/preprocessing/tokey.py @@ -19,8 +19,7 @@ class ToKey(BasePipelineItem, DefaultSignature): """ - Text transforms that can be performed on data before training - a model. + Converts input values (words, numbers, etc.) to index in a dictionary. .. remarks:: The ``ToKey`` transform converts a column of text to key values diff --git a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py index da39b2c8..f1ee5f6b 100644 --- a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py +++ b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py @@ -30,7 +30,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature): input time-series where each component in the spectrum corresponds to a trend, seasonal or noise component in the time-series. For details of the Singular Spectrum Analysis (SSA), refer to `this document - `_. + `_. :param window_size: The length of the window on the series for building the trajectory matrix (parameter L). @@ -38,7 +38,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature): :param series_length: The length of series that is kept in buffer for modeling (parameter N). - :param train_size: The length of series from the begining used for + :param train_size: The length of series from the beginning used for training. :param horizon: The number of values to forecast. diff --git a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py b/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py deleted file mode 100644 index a5c34acb..00000000 --- a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py +++ /dev/null @@ -1,26 +0,0 @@ -# - Generated by tools/entrypoint_compiler.py: do not edit by hand -""" -ParquetPathParser -""" - - -from ..utils.entrypoints import Component - - -def parquet_path_parser( - **params): - """ - **Description** - Extract name/value pairs from Parquet formatted directory names. - Example path: Year=2018/Month=12/data1.parquet - - """ - - entrypoint_name = 'ParquetPathParser' - settings = {} - - component = Component( - name=entrypoint_name, - settings=settings, - kind='PartitionedPathParser') - return component diff --git a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py b/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py deleted file mode 100644 index 3f63ac19..00000000 --- a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py +++ /dev/null @@ -1,71 +0,0 @@ -# - Generated by tools/entrypoint_compiler.py: do not edit by hand -""" -SimplePathParser -""" - - -from ..utils.entrypoints import Component -from ..utils.utils import try_set - - -def simple_path_parser( - columns=None, - type='TX', - **params): - """ - **Description** - A simple parser that extracts directory names as column values. - Column names are defined as arguments. - - :param columns: Column definitions used to override the - Partitioned Path Parser. Expected with the format - name:type:numeric-source, for example, col=MyFeature:R4:1 - (settings). 
- :param type: Data type of each column. (settings). - """ - - entrypoint_name = 'SimplePathParser' - settings = {} - - if columns is not None: - settings['Columns'] = try_set( - obj=columns, - none_acceptable=True, - is_of_type=list, - is_column=True) - if type is not None: - settings['Type'] = try_set( - obj=type, - none_acceptable=True, - is_of_type=str, - values=[ - 'I1', - 'U1', - 'I2', - 'U2', - 'I4', - 'U4', - 'I8', - 'U8', - 'R4', - 'Num', - 'R8', - 'TX', - 'Text', - 'TXT', - 'BL', - 'Bool', - 'TimeSpan', - 'TS', - 'DT', - 'DateTime', - 'DZ', - 'DateTimeZone', - 'UG', - 'U16']) - - component = Component( - name=entrypoint_name, - settings=settings, - kind='PartitionedPathParser') - return component diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py deleted file mode 100644 index 3c080eb6..00000000 --- a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py +++ /dev/null @@ -1,116 +0,0 @@ -# - Generated by tools/entrypoint_compiler.py: do not edit by hand -""" -Models.OnnxConverter -""" - - -from ..utils.entrypoints import EntryPoint -from ..utils.utils import try_set, unlist - - -def models_onnxconverter( - onnx, - data_file=None, - json=None, - name=None, - domain=None, - inputs_to_drop=None, - outputs_to_drop=None, - model=None, - onnx_version='Stable', - predictive_model=None, - **params): - """ - **Description** - Converts the model to ONNX format. - - :param data_file: The data file (inputs). - :param onnx: The path to write the output ONNX to. (inputs). - :param json: The path to write the output JSON to. (inputs). - :param name: The 'name' property in the output ONNX. By default - this will be the ONNX extension-less name. (inputs). - :param domain: The 'domain' property in the output ONNX. - (inputs). - :param inputs_to_drop: Array of input column names to drop - (inputs). - :param outputs_to_drop: Array of output column names to drop - (inputs). - :param model: Model that needs to be converted to ONNX format. - (inputs). - :param onnx_version: The targeted ONNX version. It can be either - "Stable" or "Experimental". If "Experimental" is used, - produced model can contain components that is not officially - supported in ONNX standard. (inputs). - :param predictive_model: Predictor model that needs to be - converted to ONNX format. (inputs). 
- """ - - entrypoint_name = 'Models.OnnxConverter' - inputs = {} - outputs = {} - - if data_file is not None: - inputs['DataFile'] = try_set( - obj=data_file, - none_acceptable=True, - is_of_type=str) - if onnx is not None: - inputs['Onnx'] = try_set( - obj=onnx, - none_acceptable=False, - is_of_type=str) - if json is not None: - inputs['Json'] = try_set( - obj=json, - none_acceptable=True, - is_of_type=str) - if name is not None: - inputs['Name'] = try_set( - obj=name, - none_acceptable=True, - is_of_type=str, - is_column=True) - if domain is not None: - inputs['Domain'] = try_set( - obj=domain, - none_acceptable=True, - is_of_type=str) - if inputs_to_drop is not None: - inputs['InputsToDrop'] = try_set( - obj=inputs_to_drop, - none_acceptable=True, - is_of_type=list) - if outputs_to_drop is not None: - inputs['OutputsToDrop'] = try_set( - obj=outputs_to_drop, - none_acceptable=True, - is_of_type=list) - if model is not None: - inputs['Model'] = try_set( - obj=model, - none_acceptable=True, - is_of_type=str) - if onnx_version is not None: - inputs['OnnxVersion'] = try_set( - obj=onnx_version, - none_acceptable=True, - is_of_type=str, - values=[ - 'Stable', - 'Experimental']) - if predictive_model is not None: - inputs['PredictiveModel'] = try_set( - obj=predictive_model, none_acceptable=True, is_of_type=str) - - input_variables = { - x for x in unlist(inputs.values()) - if isinstance(x, str) and x.startswith("$")} - output_variables = { - x for x in unlist(outputs.values()) - if isinstance(x, str) and x.startswith("$")} - - entrypoint = EntryPoint( - name=entrypoint_name, inputs=inputs, outputs=outputs, - input_variables=input_variables, - output_variables=output_variables) - return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/models_schema.py b/src/python/nimbusml/internal/entrypoints/models_schema.py index 0b8b0056..096aa2e5 100644 --- a/src/python/nimbusml/internal/entrypoints/models_schema.py +++ b/src/python/nimbusml/internal/entrypoints/models_schema.py @@ -1,5 +1,6 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand """ -Models.Summarizer +Models.Schema """ @@ -8,23 +9,24 @@ def models_schema( - transform_model, + model, schema=None, **params): """ **Description** - Retreives input/output column schema for transform model. + Retrieve output model schema - :param transform_model: The transform model. + :param model: The transform model. (inputs). + :param schema: The model schema (outputs). """ entrypoint_name = 'Models.Schema' inputs = {} outputs = {} - if transform_model is not None: + if model is not None: inputs['Model'] = try_set( - obj=transform_model, + obj=model, none_acceptable=False, is_of_type=str) if schema is not None: @@ -32,7 +34,7 @@ def models_schema( obj=schema, none_acceptable=False, is_of_type=str) - + input_variables = { x for x in unlist(inputs.values()) if isinstance(x, str) and x.startswith("$")} diff --git a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py index f02da3a7..1684783c 100644 --- a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py +++ b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py @@ -43,7 +43,7 @@ def timeseriesprocessingentrypoints_ssaforecasting( building the trajectory matrix (parameter L). (inputs). 
:param series_length: The length of series that is kept in buffer for modeling (parameter N). (inputs). - :param train_size: The length of series from the begining used + :param train_size: The length of series from the beginning used for training. (inputs). :param horizon: The number of values to forecast. (inputs). :param confidence_level: The confidence level in [0, 1) for diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py index e5b62a23..5c281338 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py @@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelbinaryclassifier( **Description** Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It - mantains no interactions between features. + maintains no interactions between features. :param number_of_iterations: Total number of iterations over all features (inputs). diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py index 1c56a706..2b9334f8 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py @@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelregressor( **Description** Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It - mantains no interactions between features. + maintains no interactions between features. :param number_of_iterations: Total number of iterations over all features (inputs). diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py index 5db498b1..61759e4d 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py @@ -33,7 +33,7 @@ def trainers_logisticregressionclassifier( **params): """ **Description** - Maximum entrypy classification is a method in statistics used to + Maximum entropy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function. 
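For the record, the softmax mapping the corrected description refers to turns per-class linear scores into class probabilities; in LaTeX:

P(y = k \mid x) = \frac{\exp(w_k^\top x + b_k)}{\sum_j \exp(w_j^\top x + b_j)}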
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py index 7a5d8c71..addc2298 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py @@ -1,3 +1,4 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand """ Transforms.DatasetScorerEx """ diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py index 7afc028a..ac2524c8 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py @@ -14,7 +14,6 @@ def transforms_datetimesplitter( prefix, output_data=None, model=None, - columns_to_drop=None, country='None', **params): """ @@ -24,8 +23,6 @@ def transforms_datetimesplitter( :param source: Input column (inputs). :param data: Input dataset (inputs). :param prefix: Output column prefix (inputs). - :param columns_to_drop: Columns to drop after the DateTime - Expansion (inputs). :param country: Country to get holidays for. Defaults to none if not passed (inputs). :param output_data: Transformed dataset (outputs). @@ -52,12 +49,6 @@ def transforms_datetimesplitter( obj=prefix, none_acceptable=False, is_of_type=str) - if columns_to_drop is not None: - inputs['ColumnsToDrop'] = try_set( - obj=columns_to_drop, - none_acceptable=True, - is_of_type=list, - is_column=True) if country is not None: inputs['Country'] = try_set( obj=country, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py index 1f1a3870..121115b4 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py @@ -21,7 +21,7 @@ def transforms_missingvaluehandler( **Description** Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An - indicator column can optionally be concatenated, if theinput + indicator column can optionally be concatenated, if the input column type is numeric. :param column: New column definition(s) (optional form: diff --git a/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py index cfe672b7..301f1c2f 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py @@ -1,3 +1,4 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand """ Transforms.PrefixColumnConcatenator """ @@ -15,10 +16,10 @@ def transforms_prefixcolumnconcatenator( **params): """ **Description** - Concatenates one or more columns of the same item type by prefix. + Concatenates one or more columns of the same item type. - :param column: New column definition(s) (optional form: - name:srcs) (inputs). + :param column: New column definition(s) (optional form: name:src) + (inputs). :param data: Input dataset (inputs). :param output_data: Transformed dataset (outputs). :param model: Transform model (outputs). 
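A minimal sketch of the concatenator this entrypoint backs, assuming the public class is exposed as nimbusml.preprocessing.schema.PrefixColumnConcatenator (alongside ColumnConcatenator) and that the columns dict maps the output column name to the shared column-name prefix, as the class docstring describes:

import pandas
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

df = pandas.DataFrame(data=dict(
    Sepal_Length=[2.5, 1.0], Sepal_Width=[0.7, 0.4], Label=[0, 1]))

concat = PrefixColumnConcatenator(columns={'Sepal': 'Sepal_'})  # gather every 'Sepal_*' column
features = concat.fit_transform(df)  # adds a single vector-valued 'Sepal' column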
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
index e19bd1f1..499a08e3 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
@@ -83,8 +83,7 @@ def transforms_timeseriesimputer(
         values=[
             'ForwardFill',
             'BackFill',
-            'Median',
-            'Interpolate'])
+            'Median'])
     if supress_type_errors is not None:
         inputs['SupressTypeErrors'] = try_set(
             obj=supress_type_errors,
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
similarity index 82%
rename from src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py
rename to src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
index 16fca0ad..febcffde 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
@@ -1,3 +1,4 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
 """
 Transforms.VariableColumnTransform
 """
@@ -7,7 +8,7 @@
 from ..utils.utils import try_set, unlist


-def transforms_variablecolumn(
+def transforms_variablecolumntransform(
         data,
         output_data=None,
         model=None,
@@ -16,10 +17,12 @@
         **params):
     """
     **Description**
-        Combines the specified input columns in to a
-        single variable length vectorized column.
+        Combines the specified input columns into a single variable length
+        vectorized column.

     :param data: Input dataset (inputs).
+    :param features: Features (inputs).
+    :param length_column_name: Length Column Name (inputs).
     :param output_data: Transformed dataset (outputs).
     :param model: Transform model (outputs).
     """
@@ -43,7 +46,8 @@ def transforms_variablecolumn(
         inputs['LengthColumnName'] = try_set(
             obj=length_column_name,
             none_acceptable=True,
-            is_of_type=str)
+            is_of_type=str,
+            is_column=True)
     if output_data is not None:
         outputs['OutputData'] = try_set(
             obj=output_data,
diff --git a/src/python/nimbusml/internal/utils/entrypoints.py b/src/python/nimbusml/internal/utils/entrypoints.py
index 0e06ff15..a907e52a 100644
--- a/src/python/nimbusml/internal/utils/entrypoints.py
+++ b/src/python/nimbusml/internal/utils/entrypoints.py
@@ -320,8 +320,8 @@ def _get_separator(self):
         return pieces[0].replace("sep=", "").strip()

     def run(self, X, y=None, max_slots=-1, random_state=None, verbose=1, **params):
-        if params.get("dryrun") is not None:
-            return 'graph = %s' % (str(self))
+        if params.get("dry_run", False):
+            return str(self)

         output_modelfilename = None
         output_predictor_modelfilename = None
diff --git a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
index 0b467a37..3825c9e0 100644
--- a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
+++ b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
@@ -67,10 +67,10 @@ class AveragedPerceptronBinaryClassifier(
         `_

         `Large Margin Classification Using the Perceptron Algorithm
-        `_
+        `_

         `Discriminative Training Methods for Hidden Markov Models
-        `_
+        `_

     :param feature: see `Columns `_.
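The `run()` change above replaces the old `dryrun` flag with `dry_run` and returns the serialized graph itself. A hedged sketch of the CV-level behavior this enables (wired up later in this diff); the dataset and columns mirror the new CV test and are otherwise illustrative:

```python
# Hedged sketch: with dry_run=True, CV.fit() returns the entry-point
# graph as a JSON string instead of running cross validation.
import json
from nimbusml import DataSchema, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRegressor
from nimbusml.model_selection import CV

path = get_dataset('airquality').as_filepath()
data = FileDataStream(path, DataSchema.read_schema(path))

cv = CV([LightGbmRegressor(feature=['Ozone', 'Temp'], label='Wind')])
graph_json = cv.fit(data, split_start='after_transforms', dry_run=True)
node_names = [ep['Name'] for ep in json.loads(graph_json)['nodes']]
```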
diff --git a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py index 4758454b..3f0fd7c7 100644 --- a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py @@ -23,7 +23,7 @@ class FastLinearBinaryClassifier( """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear binary classification. .. remarks:: ``FastLinearBinaryClassifier`` is a trainer based on the Stochastic @@ -80,8 +80,7 @@ class FastLinearBinaryClassifier( content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/linear_model/fastlinearclassifier.py b/src/python/nimbusml/linear_model/fastlinearclassifier.py index d1ef7644..50162961 100644 --- a/src/python/nimbusml/linear_model/fastlinearclassifier.py +++ b/src/python/nimbusml/linear_model/fastlinearclassifier.py @@ -21,7 +21,8 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): """ - Train an SDCA multi class model + A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for + multi class classification. .. remarks:: ``FastLinearClassifier`` is a trainer based on the Stochastic Dual @@ -77,8 +78,7 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/linear_model/fastlinearregressor.py b/src/python/nimbusml/linear_model/fastlinearregressor.py index 766a79ae..73745f22 100644 --- a/src/python/nimbusml/linear_model/fastlinearregressor.py +++ b/src/python/nimbusml/linear_model/fastlinearregressor.py @@ -22,7 +22,7 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): """ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer - for linear binary classification and regression. + for linear regression. .. remarks:: ``FastLinearRegressor`` is a trainer based on the Stochastic Dual @@ -77,8 +77,7 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): content/uploads/2016/06/main-3.pdf>`_ `Stochastic Dual Coordinate Ascent Methods for Regularized Loss - Minimization `_ + Minimization `_ :param feature: see `Columns `_. 
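The corrected one-line summaries above distinguish the three SDCA trainers by task. A hedged sketch, assuming toy data, of the same optimizer specialized for binary, multi class, and regression problems:

```python
# Hedged sketch: the SDCA family after the docstring fixes above.
import pandas as pd
from nimbusml.linear_model import (FastLinearBinaryClassifier,
                                   FastLinearClassifier,
                                   FastLinearRegressor)

X = pd.DataFrame(dict(f0=[0.1, 0.9, 0.2, 0.8], f1=[1.0, 0.0, 0.9, 0.1]))

FastLinearBinaryClassifier().fit(X, pd.Series([0, 1, 0, 1]))   # binary
FastLinearClassifier().fit(X, pd.Series([0, 1, 2, 1]))         # multi class
FastLinearRegressor().fit(X, pd.Series([1.0, 2.0, 1.5, 2.5]))  # regression
```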
diff --git a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py index 1cf29de4..76410659 100644 --- a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py @@ -91,14 +91,14 @@ class LogisticRegressionBinaryClassifier( **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/nimbusml/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/linear_model/logisticregressionclassifier.py index 265adc10..9155799e 100644 --- a/src/python/nimbusml/linear_model/logisticregressionclassifier.py +++ b/src/python/nimbusml/linear_model/logisticregressionclassifier.py @@ -92,14 +92,14 @@ class LogisticRegressionClassifier( **Reference** - `Wikipedia: L-BFGS `_ + `Wikipedia: L-BFGS `_ `Wikipedia: Logistic - regression `_ + regression `_ `Scalable Training of L1-Regularized Log-Linear Models - `_ + `_ `Test Run - L1 and L2 Regularization for Machine Learning diff --git a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py index a5ee573d..893f6465 100644 --- a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py @@ -34,14 +34,14 @@ class SgdBinaryClassifier(core, BasePredictor, ClassifierMixin): associated optimization problem is sparse, then Hogwild SGD achieves a nearly optimal rate of convergence. For a detailed reference, please - refer to `http://arxiv.org/pdf/1106.5730v2.pdf - `_. + refer to `https://arxiv.org/pdf/1106.5730v2.pdf + `_. **Reference** - `http://arxiv.org/pdf/1106.5730v2.pdf - `_ + `https://arxiv.org/pdf/1106.5730v2.pdf + `_ :param feature: see `Columns `_. diff --git a/src/python/nimbusml/model_selection/cv.py b/src/python/nimbusml/model_selection/cv.py index 79a5def4..effff597 100644 --- a/src/python/nimbusml/model_selection/cv.py +++ b/src/python/nimbusml/model_selection/cv.py @@ -96,7 +96,7 @@ class CV: the average of each metric on all models. :param pipeline: Pipeline object or a list of pipeline steps that's - used for cross validation + used for cross validation ''' fold_column_name = 'Fold' @@ -307,27 +307,22 @@ def _cleanup_results(self, results, cv): return clean_results - def _process_split_start(self, split_start): - nodes = self._pipeline.nodes - pipeline_len = len(nodes) + def _process_split_start(self, split_start, num_transform_nodes): if isinstance(split_start, str): if split_start == 'before_transforms': split_index = 0 elif split_start == 'after_transforms': - split_index = pipeline_len - 1 + split_index = num_transform_nodes else: raise ValueError( 'String value for split_start should be either ' '"before_transforms" or "after_transforms"') if isinstance(split_start, six.integer_types): - try: - nodes[split_start] - except IndexError: + if split_start > num_transform_nodes: raise ValueError( 'Pipeline doesn\'t contain a step for split_start={' - '}'.format( - split_start)) + '}'.format(split_start)) split_index = split_start @@ -335,7 +330,10 @@ def _process_split_start(self, split_start): # Convert split_index to positive number, so that it can index into # list of transfroms without the learner. 
if split_index < 0: - split_index = split_index + pipeline_len + split_index = split_index + num_transform_nodes + + if split_index < 0: + raise ValueError('Invalid split index.') return split_index @@ -426,6 +424,7 @@ def fit( self._results = None self._raw_results = None verbose = 1 + dry_run = params.pop('dry_run', False) # _fit_graph() seems to have side-effects on the pipeline object # Use a clone, so that we can reuse CV object for multiple calls to @@ -468,9 +467,10 @@ def fit( 'groups in .fit() function.') - split_index = self._process_split_start(split_start) graph_sections = cv_aux_info.graph_sections transforms = graph_sections.get('transform_nodes', []) + + split_index = self._process_split_start(split_start, len(transforms)) pre_split_transforms = transforms[:split_index] post_split_transforms = transforms[split_index:] implicit_nodes = graph_sections['implicit_nodes'] @@ -562,11 +562,16 @@ def fit( telemetry_info=telemetry_info, is_cv=True, output_types=self.output_types, + dry_run=dry_run, **params) except RuntimeError as e: self._run_time = time.time() - start_time raise e - self._raw_results = graph_run_results - self._results = self._cleanup_results(graph_run_results, cv) + if dry_run: + self._results = graph_run_results + else: + self._raw_results = graph_run_results + self._results = self._cleanup_results(graph_run_results, cv) + return self._results diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 3e0dce27..4efa9cf0 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -119,10 +119,10 @@ class Pipeline: for more details on how to select these. :param steps: the list of operator or (name, operator) tuples that - are chained in the appropriate order. + are chained in the appropriate order. :param model: the path to the model file (".zip") if want to load a - model directly from file (such as a trained model from ML.NET). + model directly from file (such as a trained model from ML.NET). :param random_state: the integer used as the random seed. @@ -1223,6 +1223,19 @@ def fit_transform( :param X: {array-like [n_samples, n_features], :py:func:`FileDataStream ` } :param y: {array-like [n_samples]} + :param as_binary_data_stream: If ``True`` then output an IDV file. + See `here `_ + for more information. + :param params: Additional arguments. + If ``as_csr=True`` and ``as_binary_data_stream=False`` then + return the transformed data in CSR (sparse matrix) format. + If ``as_binary_data_stream`` is also true then that + parameter takes precedence over ``as_csr`` and the output will + be an IDV file. + + :return: Returns a pandas DataFrame if no other output format + is specified. See ``as_binary_data_stream`` and ``as_csr`` + for other available output formats. 
""" self.fit( X, @@ -1529,10 +1542,14 @@ def _evaluation_infer(self, evaltype, label_column, group_id, models_anomalydetectionevaluator(**params)]) elif type_ == 'ranking': - svd = "$scoredVectorData" column = [OrderedDict(Source=group_id, Name=group_id)] - algo_args = dict(data=svd, output_data=svd, column=column) + algo_args = dict( + data="$scoredVectorData", + output_data="$scoredVectorData2", + column=column) key_node = transforms_texttokeyconverter(**algo_args) + + params['data'] = "$scoredVectorData2" evaluate_node = models_rankingevaluator( group_id_column=group_id, **params) all_nodes.extend([ @@ -1834,7 +1851,7 @@ def get_output_columns(self, verbose=0, **params): inputs = dict([('transform_model', self.model)]) schema_node = models_schema( - transform_model="$transform_model", + model="$transform_model", schema="$output_data") all_nodes = [schema_node] @@ -2443,7 +2460,19 @@ def transform( :param X: {array-like [n_samples, n_features], :py:class:`nimbusml.FileDataStream` } :param y: {array-like [n_samples]} - + :param as_binary_data_stream: If ``True`` then output an IDV file. + See `here `_ + for more information. + :param params: Additional arguments. + If ``as_csr=True`` and ``as_binary_data_stream=False`` then + return the transformed data in CSR (sparse matrix) format. + If ``as_binary_data_stream`` is also true then that + parameter takes precedence over ``as_csr`` and the output will + be an IDV file. + + :return: Returns a pandas DataFrame if no other output format + is specified. See ``as_binary_data_stream`` and ``as_csr`` + for other available output formats. """ # start the clock! start_time = time.time() @@ -2539,7 +2568,7 @@ def summary(self, verbose=0, **params): if len(self.steps) > 0 and not isinstance( self.last_node, BasePredictor): raise ValueError( - "Summary is availabe only for predictor types, instead " + "Summary is available only for predictor types, instead " "got " + self.last_node.type) @@ -2577,6 +2606,10 @@ def summary(self, verbose=0, **params): self._run_time = time.time() - start_time raise e + # .summary() not supported if size of summary_data + # is less or equal to 1 (if only PredictedName in summary_data) + if summary_data.size == 1 and summary_data.columns.values == ["PredictorName"]: + raise TypeError("One or more predictors in this pipeline do not support the .summary() function.") self.model_summary = summary_data # stop the clock diff --git a/src/python/nimbusml/preprocessing/datetimesplitter.py b/src/python/nimbusml/preprocessing/datetimesplitter.py index fb33337b..c3fceb43 100644 --- a/src/python/nimbusml/preprocessing/datetimesplitter.py +++ b/src/python/nimbusml/preprocessing/datetimesplitter.py @@ -27,8 +27,6 @@ class DateTimeSplitter(core, BaseTransform, TransformerMixin): :param prefix: Output column prefix. - :param columns_to_drop: Columns to drop after the DateTime Expansion. - :param country: Country to get holidays for. Defaults to none if not passed. 
@@ -40,7 +38,6 @@ class DateTimeSplitter(core, BaseTransform, TransformerMixin): def __init__( self, prefix, - columns_to_drop=None, country='None', columns=None, **params): @@ -51,7 +48,6 @@ def __init__( core.__init__( self, prefix=prefix, - columns_to_drop=columns_to_drop, country=country, **params) self._columns = columns diff --git a/src/python/nimbusml/preprocessing/fromkey.py b/src/python/nimbusml/preprocessing/fromkey.py index f83d90a7..126d6b5f 100644 --- a/src/python/nimbusml/preprocessing/fromkey.py +++ b/src/python/nimbusml/preprocessing/fromkey.py @@ -20,8 +20,7 @@ class FromKey(core, BaseTransform, TransformerMixin): """ - Text transforms that can be performed on data before training - a model. + Converts the key types back to their original values. .. remarks:: The ``FromKey`` transform converts a column of keys, generated using diff --git a/src/python/nimbusml/preprocessing/missing_values/handler.py b/src/python/nimbusml/preprocessing/missing_values/handler.py index 1a1fac0a..01da758b 100644 --- a/src/python/nimbusml/preprocessing/missing_values/handler.py +++ b/src/python/nimbusml/preprocessing/missing_values/handler.py @@ -54,14 +54,13 @@ class Handler(core, BaseTransform, TransformerMixin): For more details see `Columns `_. :param replace_with: The method to use to replace NaN values. The - following choices are available. - - * Def: Replace with default value of that type, usually ``0``. If no - replace - method is specified, this is the default strategy. - * Mean: Replace NaN values with the mean of the values in that column. - * Min: Replace with minimum value in the column. - * Max: Replace with maximum value in the column. + following choices are available. + + * Def: Replace with default value of that type, usually ``0``. If no + replace method is specified, this is the default strategy. + * Mean: Replace NaN values with the mean of the values in that column. + * Min: Replace with minimum value in the column. + * Max: Replace with maximum value in the column. :param impute_by_slot: Whether to impute values by slot. diff --git a/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py b/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py index 9a3aa443..6e0662e1 100644 --- a/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py +++ b/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand """ PrefixColumnConcatenator """ @@ -17,7 +18,10 @@ from ...internal.utils.utils import trace -class PrefixColumnConcatenator(core, BaseTransform, TransformerMixin): +class PrefixColumnConcatenator( + core, + BaseTransform, + TransformerMixin): """ Combines several columns into a single vector-valued column by prefix. diff --git a/src/python/nimbusml/preprocessing/tokey.py b/src/python/nimbusml/preprocessing/tokey.py index 97c00ad3..c94c2eac 100644 --- a/src/python/nimbusml/preprocessing/tokey.py +++ b/src/python/nimbusml/preprocessing/tokey.py @@ -20,8 +20,7 @@ class ToKey(core, BaseTransform, TransformerMixin): """ - Text transforms that can be performed on data before training - a model. + Converts input values (words, numbers, etc.) to index in a dictionary. .. 
remarks::
        The ``ToKey`` transform converts a column of text to key values
diff --git a/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py b/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
index 0c31c9ff..16442dff 100644
--- a/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
+++ b/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
@@ -19,8 +19,10 @@

 class TestLightGbmClassifier(unittest.TestCase):

-    @unittest.skipIf(platform.system() in ("Linux", "Darwin") and six.PY2,
-                     "encoding/decoding issues with linux py2.7, bug 286536")
+    @unittest.skipIf(platform.system() == "Darwin" and six.PY2,
+                     "Disabled due to bug on Mac Python 2.7 build, more info: \
+                     https://github.com/microsoft/NimbusML/issues/366, \
+                     https://github.com/microsoft/NimbusML/pull/362")
     def test_lightgbmclassifier(self):
         np.random.seed(0)
         train_file = get_dataset('wiki_detox_train').as_filepath()
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py
new file mode 100644
index 00000000..5914a24f
--- /dev/null
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py
@@ -0,0 +1,34 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+import pandas
+
+from nimbusml import Pipeline
+from nimbusml.feature_extraction.text import NGramExtractor
+from nimbusml.preprocessing.text import CharTokenizer
+from nimbusml.preprocessing.schema import ColumnDropper
+
+
+class TestNGramExtractor(unittest.TestCase):
+
+    def test_ngramextractor(self):
+        train_df = pandas.DataFrame(data=dict(review=['one', 'two']))
+
+        pipeline = Pipeline([
+            CharTokenizer(columns={'review_transform': 'review'}),
+            NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
+            ColumnDropper(columns=['review_transform', 'review'])
+        ])
+
+        result = pipeline.fit_transform(train_df)
+        self.assertEqual(len(result.columns), 6)
+        self.assertEqual(result.loc[0, 'ngrams.o|n|e'], 1.0)
+        self.assertEqual(result.loc[1, 'ngrams.o|n|e'], 0.0)
+        self.assertEqual(result.loc[0, 'ngrams.t|w|o'], 0.0)
+        self.assertEqual(result.loc[1, 'ngrams.t|w|o'], 1.0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
index 6b183b91..21ad6c12 100644
--- a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License.
# -------------------------------------------------------------------------------------------- -import os +import platform import unittest import numpy as np @@ -18,8 +18,10 @@ class TestNGramFeaturizer(unittest.TestCase): - @unittest.skipIf(os.name != "nt" and six.PY2, - "encoding/decoding issues with linux py2.7, bug 286536") + @unittest.skipIf(platform.system() == "Darwin" and six.PY2, + "Disabled due to bug on Mac Python 2.7 build, more info: \ + https://github.com/microsoft/NimbusML/issues/366, \ + https://github.com/microsoft/NimbusML/pull/362") def test_ngramfeaturizer(self): np.random.seed(0) train_file = get_dataset('wiki_detox_train').as_filepath() diff --git a/src/python/nimbusml/tests/model_selection/test_cv.py b/src/python/nimbusml/tests/model_selection/test_cv.py index 2f264de2..b6883331 100644 --- a/src/python/nimbusml/tests/model_selection/test_cv.py +++ b/src/python/nimbusml/tests/model_selection/test_cv.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------------------------- import os +import json import unittest import numpy as np @@ -11,13 +12,14 @@ from nimbusml import Pipeline, FileDataStream, Role, DataSchema from nimbusml.cluster import KMeansPlusPlus from nimbusml.datasets import get_dataset -from nimbusml.ensemble import FastForestRegressor, LightGbmRanker +from nimbusml.ensemble import FastForestRegressor, LightGbmRanker, LightGbmRegressor from nimbusml.feature_extraction.categorical import OneHotVectorizer, \ OneHotHashVectorizer from nimbusml.linear_model import FastLinearClassifier, \ LogisticRegressionBinaryClassifier, LogisticRegressionClassifier from nimbusml.model_selection import CV from nimbusml.preprocessing import ToKey +from nimbusml.preprocessing.missing_values import Indicator, Handler from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper from nimbusml.tests.test_utils import split_features_and_label from sklearn.utils.testing import assert_equal, assert_true, \ @@ -123,6 +125,9 @@ def check_cv( cv = CV(pipeline) if split_start == 'try_all': len_pipeline = len(pipeline.nodes) + if pipeline.last_node.type != 'transform': + len_pipeline = len_pipeline - 1 + values_to_test = ['after_transforms', 'before_transforms'] values_to_test.extend(list(range(len_pipeline))) values_to_test.extend(list(range(-len_pipeline, 0))) @@ -249,6 +254,38 @@ def test_unsupported_split_start(self): self.check_cv_with_defaults( split_start=split_start, graph_id=str(split_start)) + def test_split_start_with_transforms_with_presteps(self): + path = get_dataset("airquality").as_filepath() + schema = DataSchema.read_schema(path) + data = FileDataStream(path, schema) + + pipeline_steps = [ + Indicator() << {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}, + Handler(replace_with='Mean') << { + 'Solar_R': 'Solar_R', + 'Ozone': 'Ozone'}, + LightGbmRegressor( + feature=['Ozone', + 'Solar_R', + 'Ozone_ind', + 'Solar_R_ind', + 'Temp'], + label='Wind')] + + results = CV(pipeline_steps).fit(data, + split_start='after_transforms', + dry_run=True) + results = json.loads(results) + + node_names = [ep['Name'] for ep in results['nodes']] + cv_node = [ep for ep in results['nodes'] + if 'Models.CrossValidator' in ep['Name']][0] + cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']] + + self.assertTrue('Transforms.MissingValueHandler' in node_names) + self.assertTrue('Transforms.MissingValueHandler' not in cv_sub_node_names) + self.assertTrue('Transforms.ModelCombiner' in node_names) + class 
TestCvBinary(unittest.TestCase):

     infert_case_index = 5
@@ -562,3 +599,7 @@ def test_df(self):
             y=[0, 1, 2, 10, 11, 12, -10, -11, -12],
             z=[0, 1, 2, 10, 11, 12, -10, -11, -12]))
         check_cv([KMeansPlusPlus(n_clusters=3)], X)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/nimbusml/tests/model_summary/test_model_summary.py b/src/python/nimbusml/tests/model_summary/test_model_summary.py
index 650238ae..3d0c659c 100644
--- a/src/python/nimbusml/tests/model_summary/test_model_summary.py
+++ b/src/python/nimbusml/tests/model_summary/test_model_summary.py
@@ -66,25 +66,24 @@
     #SymSgdBinaryClassifier(),
     OrdinaryLeastSquaresRegressor(),
     PoissonRegressionRegressor(),
-    OneVsRestClassifier(FastLinearBinaryClassifier()),
     GamRegressor(),
     GamBinaryClassifier(),
     PcaAnomalyDetector(),
-    FactorizationMachineBinaryClassifier(),
-    KMeansPlusPlus(),
-    NaiveBayesClassifier(),
     FastForestBinaryClassifier(number_of_trees=2),
     FastForestRegressor(number_of_trees=2),
     FastTreesBinaryClassifier(number_of_trees=2),
     FastTreesRegressor(number_of_trees=2),
     FastTreesTweedieRegressor(number_of_trees=2),
     LightGbmRegressor(number_of_iterations=2),
-    LightGbmClassifier(),
     LightGbmBinaryClassifier(number_of_iterations=2)
     ]

 learners_not_supported = [
-    #PcaTransformer(), # REVIEW: crashes
+    FactorizationMachineBinaryClassifier(),
+    OneVsRestClassifier(FastLinearBinaryClassifier()),
+    KMeansPlusPlus(n_clusters=2),
+    NaiveBayesClassifier(),
+    LightGbmClassifier()
     ]
@@ -98,7 +98,6 @@ def test_model_summary(self):
             pipeline.fit(train_stream, label_column)
             pipeline.summary()

-    @unittest.skip("No unsupported learners")
     def test_model_summary_not_supported(self):
         for learner in learners_not_supported:
             pipeline = Pipeline(
@@ -107,6 +106,23 @@ def test_model_summary_not_supported(self):
             pipeline.fit(train_stream, label_column)
             assert_raises(TypeError, pipeline.summary)

+    def test_model_summary_not_supported_specific(self):
+        path = get_dataset('infert').as_filepath()
+        data = FileDataStream.read_csv(path, sep=',',
+                                       names={0: 'row_num', 5: 'case'})
+        pipeline = Pipeline([
+            OneHotVectorizer(columns={'edu': 'education'}),
+            FactorizationMachineBinaryClassifier(feature=['induced', 'edu', 'parity'],
+                                                 label='case')
+        ])
+        pipeline.fit(data)
+        try:
+            pipeline.summary()
+        except TypeError as e:
+            self.assertEqual(e.args[0], "One or more predictors in this pipeline do not support the .summary() function.")
+        else:
+            assert False
+
     def test_summary_called_back_to_back_on_predictor(self):
         """
         When a predictor is fit without using a Pipeline,
@@ -119,24 +135,24 @@ def test_summary_called_back_to_back_on_predictor(self):
         ols.summary()

     def test_pipeline_summary_is_refreshed_after_refitting(self):
-        predictor = OrdinaryLeastSquaresRegressor(normalize='No', l2_regularization=0)
+        predictor = OrdinaryLeastSquaresRegressor()
         pipeline = Pipeline([predictor])
         pipeline.fit([0,1,2,3], [1,2,3,4])
         summary1 = pipeline.summary()

-        pipeline.fit([0,1,2,3], [2,5,8,11])
+        pipeline.fit([0,1,2.5,3], [2,5,8,11])
         summary2 = pipeline.summary()

         self.assertFalse(summary1.equals(summary2))

     def test_predictor_summary_is_refreshed_after_refitting(self):
-        predictor = OrdinaryLeastSquaresRegressor(normalize='No', l2_regularization=0)
+        predictor = OrdinaryLeastSquaresRegressor()
         predictor.fit([0,1,2,3], [1,2,3,4])
         summary1 = predictor.summary()

-        predictor.fit([0,1,2,3], [2,5,8,11])
+        predictor.fit([0,1,2.5,3], [2,5,8,11])
         summary2 = predictor.summary()

         self.assertFalse(summary1.equals(summary2))
diff --git
a/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py b/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py index 4b414c38..36d44b85 100644 --- a/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py +++ b/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py @@ -19,8 +19,6 @@ class TestNaiveBayesClassifier(unittest.TestCase): - @unittest.skipIf(os.name != "nt" and six.PY2, - "encoding/decoding issues with linux py2.7, bug 286536") def test_naivebayesclassifier(self): np.random.seed(0) train_file = get_dataset("wiki_detox_train").as_filepath() diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index 19bc26ce..3807507e 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -5,6 +5,7 @@ import os import pickle +import tempfile import unittest import numpy as np @@ -32,6 +33,12 @@ (train, label) = get_X_y(train_file, label_column, sep=',') (test, test_label) = get_X_y(test_file, label_column, sep=',') +def get_temp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + class TestLoadSave(unittest.TestCase): @@ -48,7 +55,7 @@ def test_model_dataframe(self): model_nimbusml.fit(train, label) # Save with pickle - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') with open(pickle_filename, 'wb') as f: pickle.dump(model_nimbusml, f) @@ -65,9 +72,10 @@ def test_model_dataframe(self): test, test_label, output_scores=True) # Save load with pipeline methods - model_nimbusml.save_model('model.nimbusml.m') + model_filename = get_temp_file(suffix='.m') + model_nimbusml.save_model(model_filename) model_nimbusml_load = Pipeline() - model_nimbusml_load.load_model('model.nimbusml.m') + model_nimbusml_load.load_model(model_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_load.predict(test).head(5) @@ -82,7 +90,7 @@ def test_model_dataframe(self): model_nimbusml_load.sum().sum(), decimal=2) - os.remove('model.nimbusml.m') + os.remove(model_filename) def test_model_datastream(self): model_nimbusml = Pipeline( @@ -97,7 +105,7 @@ def test_model_datastream(self): model_nimbusml.fit(train, label) # Save with pickle - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') with open(pickle_filename, 'wb') as f: pickle.dump(model_nimbusml, f) @@ -120,9 +128,10 @@ def test_model_datastream(self): decimal=2) # Save load with pipeline methods - model_nimbusml.save_model('model.nimbusml.m') + model_filename = get_temp_file(suffix='.m') + model_nimbusml.save_model(model_filename) model_nimbusml_load = Pipeline() - model_nimbusml_load.load_model('model.nimbusml.m') + model_nimbusml_load.load_model(model_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_load.predict(test).head(5) @@ -137,7 +146,7 @@ def test_model_datastream(self): model_nimbusml_load.sum().sum(), decimal=2) - os.remove('model.nimbusml.m') + os.remove(model_filename) def test_pipeline_saves_complete_model_file_when_pickled(self): model_nimbusml = Pipeline( @@ -152,7 +161,7 @@ def test_pipeline_saves_complete_model_file_when_pickled(self): model_nimbusml.fit(train, label) metrics, score = model_nimbusml.test(test, test_label, output_scores=True) - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') # Save with pickle with 
open(pickle_filename, 'wb') as f: @@ -202,7 +211,7 @@ def test_unfitted_pickled_pipeline_can_be_fit(self): shuffle=False, number_of_threads=1))]) - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') # Save with pickle with open(pickle_filename, 'wb') as f: @@ -234,7 +243,7 @@ def test_unpickled_pipeline_has_feature_contributions(self): fc = model_nimbusml.get_feature_contributions(test) # Save with pickle - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') with open(pickle_filename, 'wb') as f: pickle.dump(model_nimbusml, f) # Unpickle model @@ -260,7 +269,7 @@ def test_unpickled_predictor_has_feature_contributions(self): fc = model_nimbusml.get_feature_contributions(test) # Save with pickle - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') with open(pickle_filename, 'wb') as f: pickle.dump(model_nimbusml, f) # Unpickle model @@ -287,7 +296,7 @@ def test_pipeline_loaded_from_zip_has_feature_contributions(self): fc = model_nimbusml.get_feature_contributions(test) # Save the model to zip - model_filename = 'nimbusml_model.zip' + model_filename = get_temp_file(suffix='.zip') model_nimbusml.save_model(model_filename) # Load the model from zip model_nimbusml_zip = Pipeline() @@ -312,7 +321,7 @@ def test_predictor_loaded_from_zip_has_feature_contributions(self): fc = model_nimbusml.get_feature_contributions(test) # Save the model to zip - model_filename = 'nimbusml_model.zip' + model_filename = get_temp_file(suffix='.zip') model_nimbusml.save_model(model_filename) # Load the model from zip model_nimbusml_zip = Pipeline() @@ -347,7 +356,7 @@ def test_pickled_pipeline_with_predictor_model(self): self.assertTrue(pipeline.predictor_model) self.assertNotEqual(pipeline.model, pipeline.predictor_model) - pickle_filename = 'nimbusml_model.p' + pickle_filename = get_temp_file(suffix='.p') with open(pickle_filename, 'wb') as f: pickle.dump(pipeline, f) diff --git a/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py index 347b2798..04f1bc35 100644 --- a/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py +++ b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------------------------- import os +import tempfile import unittest from nimbusml import FileDataStream @@ -16,6 +17,14 @@ from numpy.testing import assert_almost_equal from pandas.testing import assert_frame_equal + +def get_temp_model_file(): + fd, file_name = tempfile.mkstemp(suffix='.zip') + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + + class TestPermutationFeatureImportance(unittest.TestCase): @classmethod @@ -65,7 +74,7 @@ def test_binary_classifier(self): assert_almost_equal(self.binary_pfi['AreaUnderPrecisionRecallCurve'].sum(), -0.19365, 5) def test_binary_classifier_from_loaded_model(self): - model_path = "model.zip" + model_path = get_temp_model_file() self.binary_model.save_model(model_path) loaded_model = Pipeline() loaded_model.load_model(model_path) @@ -81,7 +90,7 @@ def test_clasifier(self): assert_almost_equal(self.classifier_pfi['PerClassLogLoss.1'].sum(), 0.419826, 6) def test_classifier_from_loaded_model(self): - model_path = "model.zip" + model_path = get_temp_model_file() self.classifier_model.save_model(model_path) loaded_model = Pipeline() loaded_model.load_model(model_path) @@ -96,7 +105,7 @@ def test_regressor(self): assert_almost_equal(self.regressor_pfi['RSquared'].sum(), -0.203612, 6) def test_regressor_from_loaded_model(self): - model_path = "model.zip" + model_path = get_temp_model_file() self.regressor_model.save_model(model_path) loaded_model = Pipeline() loaded_model.load_model(model_path) @@ -113,7 +122,7 @@ def test_ranker(self): assert_almost_equal(self.ranker_pfi['NDCG@3'].sum(), -0.236544, 6) def test_ranker_from_loaded_model(self): - model_path = "model.zip" + model_path = get_temp_model_file() self.ranker_model.save_model(model_path) loaded_model = Pipeline() loaded_model.load_model(model_path) diff --git a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py index 0b9c8141..aecfc20e 100644 --- a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py +++ b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py @@ -6,7 +6,9 @@ import unittest import pandas +from nimbusml import Pipeline from nimbusml.preprocessing import DateTimeSplitter +from nimbusml.preprocessing.schema import ColumnSelector from sklearn.utils.testing import assert_equal @@ -25,16 +27,15 @@ def test_holidays(self): )) cols_to_drop = [ - 'Hour12', 'DayOfWeek', 'DayOfQuarter', - 'DayOfYear', 'WeekOfMonth', 'QuarterOfYear', - 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel', - 'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff' + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' ] - dts = DateTimeSplitter(prefix='dt', - country='Canada', - columns_to_drop=cols_to_drop) << 'tokens1' - y = dts.fit_transform(df) + dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1' + pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) + y = pipeline.fit_transform(df) self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day') diff --git a/src/python/nimbusml/tests/test_entrypoints.py b/src/python/nimbusml/tests/test_entrypoints.py index c4e53546..5ff8e0c3 100644 --- a/src/python/nimbusml/tests/test_entrypoints.py +++ b/src/python/nimbusml/tests/test_entrypoints.py @@ -118,8 +118,12 @@ def test_logistic_regression_graph(self): 
input_data=""), dict( output_model=""), DataOutputFormat.DF, *all_nodes) # print(graph) - graph.run(X=None, dryrun=True) + graph.run(X=None, dry_run=True) # lr = graph.run(formula = "ylogical ~ xint1", data = ds # , blocks_per_read = 1, report_progress = True # ) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/test_variable_column.py b/src/python/nimbusml/tests/test_variable_column.py index 6c1fc8bd..318094ff 100644 --- a/src/python/nimbusml/tests/test_variable_column.py +++ b/src/python/nimbusml/tests/test_variable_column.py @@ -8,17 +8,17 @@ import numpy as np import pandas as pd from nimbusml import Pipeline -from nimbusml.internal.entrypoints.transforms_variablecolumn import transforms_variablecolumn +from nimbusml.internal.entrypoints.transforms_variablecolumntransform import transforms_variablecolumntransform from nimbusml.internal.utils.entrypoints import Graph, DataOutputFormat class TestVariableColumn(unittest.TestCase): def to_variable_column(self, input, features=None, length_column_name=None): - node = transforms_variablecolumn(data='$data', - output_data='$output_data', - features=features, - length_column_name=length_column_name) + node = transforms_variablecolumntransform(data='$data', + output_data='$output_data', + features=features, + length_column_name=length_column_name) graph_nodes = [node] graph = Graph(dict(data=''), diff --git a/src/python/nimbusml/timeseries/ssaforecaster.py b/src/python/nimbusml/timeseries/ssaforecaster.py index dd7e0296..35516d15 100644 --- a/src/python/nimbusml/timeseries/ssaforecaster.py +++ b/src/python/nimbusml/timeseries/ssaforecaster.py @@ -31,7 +31,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin): input time-series where each component in the spectrum corresponds to a trend, seasonal or noise component in the time-series. For details of the Singular Spectrum Analysis (SSA), refer to `this document - `_. + `_. :param columns: see `Columns `_. @@ -41,7 +41,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin): :param series_length: The length of series that is kept in buffer for modeling (parameter N). - :param train_size: The length of series from the begining used for + :param train_size: The length of series from the beginning used for training. :param horizon: The number of values to forecast. diff --git a/src/python/setup.py b/src/python/setup.py index 5fc3fcba..e8481345 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.5.1', + version='1.6.1', description='NimbusML', long_description=long_description, @@ -115,7 +115,7 @@ 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], - 'dprep': ['azureml-dataprep>=1.1.12'], + 'dprep': ['azureml-dataprep>=1.1.33'], 'utils': ['graphviz', 'imageio'], }, @@ -148,7 +148,7 @@ # Although 'package_data' is the preferred approach, in some case # you may need to place data files outside of your packages. 
See: - # http://docs.python.org/3.4/distutils/setupscript.html#installing + # https://docs.python.org/3.4/distutils/setupscript.html#installing # -additional-files # noqa # In this case, 'data_file' will be installed into # '/my_data' diff --git a/src/python/setup.py.in b/src/python/setup.py.in index e65db7d8..0489bc13 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -115,7 +115,7 @@ setup( 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], - 'dprep': ['azureml-dataprep>=1.1.12'], + 'dprep': ['azureml-dataprep>=1.1.33'], 'utils': ['graphviz', 'imageio'], }, @@ -148,7 +148,7 @@ setup( # Although 'package_data' is the preferred approach, in some case # you may need to place data files outside of your packages. See: - # http://docs.python.org/3.4/distutils/setupscript.html#installing + # https://docs.python.org/3.4/distutils/setupscript.html#installing # -additional-files # noqa # In this case, 'data_file' will be installed into # '/my_data' diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index 1a835fbc..4a4bd65f 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -7,7 +7,10 @@ """ import json import os +import unittest +from nimbusml.cluster import KMeansPlusPlus +from nimbusml.decomposition import FactorizationMachineBinaryClassifier from nimbusml.ensemble import EnsembleClassifier from nimbusml.ensemble import EnsembleRegressor from nimbusml.ensemble import LightGbmBinaryClassifier @@ -59,7 +62,10 @@ 'check_transformer_general, check_pipeline_consistency' 'check_estimators_pickle, check_estimators_dtypes' 'check_dict_unchanged, check_dtype_object, check_fit_score_takes_y' - 'check_transformer_data_not_an_array', + 'check_transformer_data_not_an_array, check_fit1d_1feature,' + 'check_fit2d_1feature, check_fit2d_predict1d, check_estimators_overwrite_params,' + 'check_estimator_sparse_data, check_fit2d_1sample, check_dont_overwrite_parameters,' + 'check_estimators_fit_returns_self', # by design returns smaller number of rows 'SkipFilter': 'check_transformer_general, ' 'check_transformer_data_not_an_array', @@ -163,21 +169,24 @@ 'check_estimators_nan_inf', # RobustScaler does not support vectorized types 'RobustScaler': 'check_estimator_sparse_data', - 'ToKeyImputer': 'check_estimator_sparse_data', + 'ToKeyImputer': + 'check_estimator_sparse_data, check_estimators_dtypes', # Most of these skipped tests are failing because the checks # require numerical types. ToString returns object types. 
 # TypeError: ufunc 'isfinite' not supported for the input types
     'ToString': 'check_estimator_sparse_data, check_pipeline_consistency'
                 'check_transformer_data_not_an_array, check_estimators_pickle'
                 'check_transformer_general',
+    'OrdinaryLeastSquaresRegressor': 'check_fit2d_1sample'
 }

 OMITTED_CHECKS_TUPLE = (
-    'OneHotHashVectorizer, FromKey, DssmFeaturizer, DnnFeaturizer, '
+    'OneHotHashVectorizer, FromKey, DnnFeaturizer, '
     'PixelExtractor, Loader, Resizer, \
     GlobalContrastRowScaler, PcaTransformer, '
     'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-    'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
+    'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer, '
+    'NGramExtractor',
     'check_transformer_data_not_an_array, check_pipeline_consistency, '
     'check_fit2d_1feature, check_estimators_fit_returns_self,\
     check_fit2d_1sample, '
@@ -210,6 +219,8 @@
     'DateTimeSplitter': DateTimeSplitter(prefix='dt', columns=['F0']),
     'EnsembleClassifier': EnsembleClassifier(num_models=3),
     'EnsembleRegressor': EnsembleRegressor(num_models=3),
+    'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False),
+    'KMeansPlusPlus': KMeansPlusPlus(n_clusters=2),
     'LightGbmBinaryClassifier': LightGbmBinaryClassifier(
         minimum_example_count_per_group=1, minimum_example_count_per_leaf=1),
     'LightGbmClassifier': LightGbmClassifier(
@@ -262,13 +273,14 @@
 MULTI_OUTPUT.extend(MULTI_OUTPUT_EX)

-
-def my_import(name):
-    components = name.split('.')
-    mod = __import__(components[0])
-    for comp in components[1:]:
-        mod = getattr(mod, comp)
-    return mod
+skip_epoints = set([
+    'OneVsRestClassifier',
+    'TreeFeaturizer',
+    # skip SymSgdBinaryClassifier for now, because of crashes.
+    'SymSgdBinaryClassifier',
+    'DatasetTransformer',
+    'TimeSeriesImputer'
+])


 def load_json(file_path):
@@ -278,98 +290,79 @@
     content_without_comments = '\n'.join(lines)
     return json.loads(content_without_comments)

+def get_epoints():
+    epoints = []
+    my_path = os.path.realpath(__file__)
+    my_dir = os.path.dirname(my_path)
+    manifest_diff_json = os.path.join(my_dir, '..', 'tools',
+                                      'manifest_diff.json')
+    manifest_diff = load_json(manifest_diff_json)
+    for e in manifest_diff['EntryPoints']:
+        if (e['NewName'] not in skip_epoints) and ('LightGbm' not in e['NewName']):
+            epoints.append((e['Module'], e['NewName']))
+
+    return epoints

-skip_epoints = set([
-    'OneVsRestClassifier',
-    'TreeFeaturizer',
-    # skip SymSgdBinaryClassifier for now, because of crashes.
-    'SymSgdBinaryClassifier',
-    'DatasetTransformer',
-    'TimeSeriesImputer'
-])
-epoints = []
-my_path = os.path.realpath(__file__)
-my_dir = os.path.dirname(my_path)
-manifest_diff_json = os.path.join(my_dir, '..', 'tools',
-                                  'manifest_diff.json')
-manifest_diff = load_json(manifest_diff_json)
-for e in manifest_diff['EntryPoints']:
-    if e['NewName'] not in skip_epoints:
-        epoints.append((e['Module'], e['NewName']))

+class TestEstimatorChecks(unittest.TestCase):
+    # This method is a static method of the class
+    # because there were pytest fixture related
+    # issues when the method was in the global scope.
+    @staticmethod
+    def generate_test_method(epoint):
+        def method(self):
+            failed_checks = set()
+            passed_checks = set()
+            class_name = epoint[1]
+            print("\n======== now Estimator is %s =========== " % class_name)

-all_checks = {}
-all_failed_checks = {}
-all_passed_checks = {}
-total_checks_passed = 0
+            mod = __import__('nimbusml.'
+ epoint[0], fromlist=[str(class_name)]) + the_class = getattr(mod, class_name) + if class_name in INSTANCES: + estimator = INSTANCES[class_name] + else: + estimator = the_class() -print("total entrypoints: {}", len(epoints)) + if estimator._use_single_input_as_string(): + estimator = estimator << 'F0' -for e in epoints: - checks = set() - failed_checks = set() - passed_checks = set() - class_name = e[1] - print("======== now Estimator is %s =========== " % class_name) - # skip LighGbm for now, because of random crashes. - if 'LightGbm' in class_name: - continue + for check in _yield_all_checks(class_name, estimator): + # Skip check_dict_unchanged for estimators which + # update the classes_ attribute. For more details + # see https://github.com/microsoft/NimbusML/pull/200 + if (check.__name__ == 'check_dict_unchanged') and \ + (hasattr(estimator, 'predict_proba') or + hasattr(estimator, 'decision_function')): + continue - mod = __import__('nimbusml.' + e[0], fromlist=[str(class_name)]) - the_class = getattr(mod, class_name) - if class_name in INSTANCES: - estimator = INSTANCES[class_name] - else: - estimator = the_class() + if check.__name__ in OMITTED_CHECKS_ALWAYS: + continue + if 'Binary' in class_name and check.__name__ in NOBINARY_CHECKS: + continue + if class_name in OMITTED_CHECKS and check.__name__ in \ + OMITTED_CHECKS[class_name]: + continue + if class_name in OMITTED_CHECKS_TUPLE[0] and check.__name__ in \ + OMITTED_CHECKS_TUPLE[1]: + continue - if estimator._use_single_input_as_string(): - estimator = estimator << 'F0' + try: + check(class_name, estimator.clone()) + passed_checks.add(check.__name__) + except Exception as e: + failed_checks.add(check.__name__) - for check in _yield_all_checks(class_name, estimator): - # Skip check_dict_unchanged for estimators which - # update the classes_ attribute. 
For more details - # see https://github.com/microsoft/NimbusML/pull/200 - if (check.__name__ == 'check_dict_unchanged') and \ - (hasattr(estimator, 'predict_proba') or - hasattr(estimator, 'decision_function')): - continue + if len(failed_checks) > 0: + self.fail(msg=str(failed_checks)) - if check.__name__ in OMITTED_CHECKS_ALWAYS: - continue - if 'Binary' in class_name and check.__name__ in NOBINARY_CHECKS: - continue - if class_name in OMITTED_CHECKS and check.__name__ in \ - OMITTED_CHECKS[class_name]: - continue - if class_name in OMITTED_CHECKS_TUPLE[0] and check.__name__ in \ - OMITTED_CHECKS_TUPLE[1]: - continue - checks.add(check.__name__) - try: - check(class_name, estimator.clone()) - passed_checks.add(check.__name__) - total_checks_passed = total_checks_passed + 1 - except Exception as e: - failed_checks.add(check.__name__) + return method - if frozenset(checks) not in all_checks: - all_checks[frozenset(checks)] = [] - all_checks[frozenset(checks)].append(class_name) - if len(failed_checks) > 0: - if frozenset(failed_checks) not in all_failed_checks: - all_failed_checks[frozenset(failed_checks)] = [] - all_failed_checks[frozenset(failed_checks)].append(class_name) +for epoint in get_epoints(): + test_name = 'test_%s' % epoint[1].lower() + method = TestEstimatorChecks.generate_test_method(epoint) + setattr(TestEstimatorChecks, test_name, method) - if frozenset(passed_checks) not in all_passed_checks: - all_passed_checks[frozenset(passed_checks)] = [] - all_passed_checks[frozenset(passed_checks)].append(class_name) -if len(all_failed_checks) > 0: - print("Following tests failed for components:") - for key, value in all_failed_checks.items(): - print('========================') - print(key) - print(value) - raise RuntimeError("estimator checks failed") -print("success, total checks passed %s ", total_checks_passed) +if __name__ == '__main__': + unittest.main() diff --git a/src/python/tests_extended/test_docs_example.py b/src/python/tests_extended/test_docs_example.py index 3c93d010..552bbdf2 100644 --- a/src/python/tests_extended/test_docs_example.py +++ b/src/python/tests_extended/test_docs_example.py @@ -6,94 +6,123 @@ import platform import subprocess import sys -import time import unittest import six from nimbusml import __file__ as myfile -class TestDocsExamples(unittest.TestCase): - - def test_examples(self): - this = os.path.abspath(os.path.dirname(__file__)) - fold = os.path.normpath( - os.path.join( - this, - '..', - 'nimbusml', - 'examples')) - if not os.path.exists(fold): - raise FileNotFoundError("Unable to find '{0}'.".format(fold)) - - fold_files = [(fold, _) for _ in os.listdir( - fold) if os.path.splitext(_)[-1] == '.py'] - if len(fold_files) == 0: - raise FileNotFoundError( - "Unable to find examples in '{0}'".format(fold)) - - # also include the 'examples_from_dataframe' files - fold_df = os.path.join(fold, 'examples_from_dataframe') - fold_files_df = [(fold_df, _) for _ in os.listdir( - fold_df) if os.path.splitext(_)[-1] == '.py'] - - # merge details of all examples into one list - fold_files.extend(fold_files_df) - fold_files.sort() - - modpath = os.path.abspath(os.path.dirname(myfile)) - modpath = os.path.normpath(os.path.join(os.path.join(modpath), '..')) - os.environ['PYTHONPATH'] = modpath - os.environ['PYTHONIOENCODING'] = 'UTF-8' - - ran = 0 - excs = [] - - for i, (fold, name) in enumerate(fold_files): +exps = [ + "Exception: 'Missing 'English.tok'", + "Missing resource for SSWE", + "Model file for Word Embedding transform could not " + "be found", + "was 
already trained. Its coefficients will be " + "overwritten. Use clone() to get an untrained " + "version of it.", + "LdaNative.dll", + "CacheClassesFromAssembly", + "Your CPU supports instructions that this TensorFlow", + "CacheClassesFromAssembly: can't map name " + "OLSLinearRegression to Void, already mapped to Void", + # TensorFlowScorer.py + "tensorflow/compiler/xla/service/service.cc:168] XLA service", + "tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device", + "tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency:", + "tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time warning): Not using XLA:CPU", + # Binner.py + "from collections import Mapping, defaultdict", + "DeprecationWarning: Using or importing the ABCs", + # BootStrapSample.py + "DeprecationWarning: the imp module is deprecated", + # PipelineWithGridSearchCV2.py + "FutureWarning: You should specify a value for 'cv'", + # PipelineWithGridSearchCV2.py + "DeprecationWarning: The default of the 'iid' parameter", + # PcaAnomalyDetector.py + "UserWarning: Model", + # FastLinearClassifier_iris_df.py + "FutureWarning: elementwise comparison failed", + # PcaAnomalyDetector_df.py + "FutureWarning: Sorting because non-concatenation axis", + # Image.py + "Unable to revert mtime: /Library/Fonts", + "Fontconfig error: Cannot load default config file", + ] +if sys.version_info[:2] <= (3, 6): + # This warning is new but it does not break any + # other unit tests. + # (3, 5) -> (3, 6) for tests on mac + # TODO: Investigate. + exps.append("RuntimeWarning: numpy.dtype size changed") + + +def get_examples(): + this = os.path.abspath(os.path.dirname(__file__)) + folder = os.path.normpath( + os.path.join( + this, + '..', + 'nimbusml', + 'examples')) + if not os.path.exists(folder): + raise FileNotFoundError("Unable to find '{0}'.".format(folder)) + + folder_files = [(folder, _) for _ in os.listdir( + folder) if os.path.splitext(_)[-1] == '.py'] + if len(folder_files) == 0: + raise FileNotFoundError( + "Unable to find examples in '{0}'".format(folder)) + + # also include the 'examples_from_dataframe' files + folder_df = os.path.join(folder, 'examples_from_dataframe') + folder_files_df = [(folder_df, _) for _ in os.listdir( + folder_df) if os.path.splitext(_)[-1] == '.py'] + + # merge details of all examples into one list + folder_files.extend(folder_files_df) + folder_files.sort() + + examples = [] + for folder, name in folder_files: + if name in ['__init__.py',]: + continue + # skip for all linux tests, mac is ok + if os.name == "posix" and platform.linux_distribution()[0] != '': if name in [ - # Bug 294481: CharTokenizer_df fails - # with error about variable length vector - 'CharTokenizer_df.py', - # Bug todo: CustomStopWordsRemover fails on ML.NET side - 'NGramFeaturizer2.py', - ]: + # SymSgdNative fails to load on linux + 'SymSgdBinaryClassifier.py', + 'SymSgdBinaryClassifier_infert_df.py', + # MICROSOFTML_RESOURCE_PATH needs to be setup on linux + 'CharTokenizer.py', + 'WordEmbedding.py', + 'WordEmbedding_df.py', + 'NaiveBayesClassifier_df.py']: continue - # skip for all linux tests, mac is ok - if os.name == "posix" and platform.linux_distribution()[0] != '': - if name in [ - # SymSgdNative fails to load on linux - 'SymSgdBinaryClassifier.py', - 'SymSgdBinaryClassifier_infert_df.py', - # MICROSOFTML_RESOURCE_PATH needs to be setup on linux - 'CharTokenizer.py', - 'WordEmbedding.py', - 'WordEmbedding_df.py', - 'NaiveBayesClassifier_df.py' - ]: - continue - # skip for ubuntu 14 tests - if 
platform.linux_distribution()[1] == 'jessie/sid': - if name in [ - # libdl needs to be setup - 'Image.py', - 'Image_df.py' - ]: - continue - # skip for centos7 tests - if platform.linux_distribution()[0] == 'CentOS Linux': - if name in [ - # libgdiplus needs to be setup - 'Image.py', - 'Image_df.py' - ]: - continue - - full = os.path.join(fold, name) - cmd = '"{0}" -u "{1}"'.format( - sys.executable.replace( - 'w.exe', '.exe'), full) - - begin = time.clock() + + examples.append((folder, name)) + + return examples + + +class TestDocsExamples(unittest.TestCase): + # This method is a static method of the class + # because there were pytest fixture related + # issues when the method was in the global scope. + @staticmethod + def generate_test_method(folder, name): + def method(self): + print("\n======== Example: %s =========== " % name) + + modpath = os.path.abspath(os.path.dirname(myfile)) + modpath = os.path.normpath(os.path.join(os.path.join(modpath), '..')) + os.environ['PYTHONPATH'] = modpath + os.environ['PYTHONIOENCODING'] = 'UTF-8' + + full = os.path.join(folder, name) + python_exe = sys.executable.replace('w.exe', '.exe') + cmd = '"{0}" -u "{1}"'.format(python_exe, full) + if six.PY2: FNULL = open(os.devnull, 'w') p = subprocess.Popen( @@ -104,59 +133,14 @@ def test_examples(self): shell=True) stdout, stderr = p.communicate() else: - with subprocess.Popen(cmd, stdout=subprocess.PIPE, + with subprocess.Popen(cmd, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.DEVNULL, shell=True) as p: stdout, stderr = p.communicate() - total = time.clock() - begin - stderr = stderr.decode('utf-8', errors='ignore').strip( - "\n\r\t ") - stdout = stdout.decode('utf-8', errors='ignore').strip( - "\n\r\t ") - exps = [ - "Exception: 'Missing 'English.tok'", - "Missing resource for SSWE", - "Model file for Word Embedding transform could not " - "be found", - "was already trained. Its coefficients will be " - "overwritten. Use clone() to get an untrained " - "version of it.", - "LdaNative.dll", - "CacheClassesFromAssembly", - "Your CPU supports instructions that this TensorFlow", - "CacheClassesFromAssembly: can't map name " - "OLSLinearRegression to Void, already mapped to Void", - # TensorFlowScorer.py - "tensorflow/compiler/xla/service/service.cc:168] XLA service", - "tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device", - "tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency:", - "tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time warning): Not using XLA:CPU", - # Binner.py - "from collections import Mapping, defaultdict", - "DeprecationWarning: Using or importing the ABCs", - # BootStrapSample.py - "DeprecationWarning: the imp module is deprecated", - # PipelineWithGridSearchCV2.py - "FutureWarning: You should specify a value for 'cv'", - # PipelineWithGridSearchCV2.py - "DeprecationWarning: The default of the 'iid' parameter", - # PcaAnomalyDetector.py - "UserWarning: Model", - # FastLinearClassifier_iris_df.py - "FutureWarning: elementwise comparison failed", - # PcaAnomalyDetector_df.py - "FutureWarning: Sorting because non-concatenation axis", - # Image.py - "Unable to revert mtime: /Library/Fonts", - "Fontconfig error: Cannot load default config file", - ] - if sys.version_info[:2] <= (3, 6): - # This warning is new but it does not break any - # other unit tests. - # (3, 5) -> (3, 6) for tests on mac - # TODO: Investigate. 
- exps.append("RuntimeWarning: numpy.dtype size changed") + stderr = stderr.decode('utf-8', errors='ignore').strip("\n\r\t ") + stdout = stdout.decode('utf-8', errors='ignore').strip("\n\r\t ") errors = None if stderr != '': @@ -165,25 +149,6 @@ def test_examples(self): errors = [_ for _ in errors if exp not in _] if errors and (len(errors) > 1 or (len(errors) == 1 and errors[0] != '')): - excs.append(RuntimeError( - "Issue with\n File '{0}'\n--CMD\n{1}\n--ERR\n{2}\n--OUT\n" - "{3}\n--".format(full, cmd, '\n'.join(errors), stdout))) - print("{0}/{1} FAIL - '{2}' in {3}s".format(i + 1, len( - fold_files), name, total)) - if len(excs) > 1: - for ex in excs: - print('--------------') - print(ex) - raise excs[-1] - else: - print("{0}/{1} OK - '{2}' in " - "{3}s".format(i + 1, len(fold_files), name, total)) - ran += 1 - - if len(excs) > 0: - for ex in excs[1:]: - print('--------------') - print(ex) import numpy import pandas import sklearn @@ -192,10 +157,18 @@ def test_examples(self): sklearn.__version__, numpy.__version__] print("DEBUG VERSIONS", versions) - raise excs[0] - elif ran == 0: - raise Exception( - "No example was run in path '{0}'.".format(fold)) + + raise RuntimeError( + "Issue with\n File '{0}'\n--CMD\n{1}\n--ERR\n{2}\n--OUT\n" + "{3}\n--".format(full, cmd, '\n'.join(errors), stdout)) + + return method + + +for example in get_examples(): + test_name = 'test_%s' % example[1].replace('.py', '').lower() + method = TestDocsExamples.generate_test_method(*example) + setattr(TestDocsExamples, test_name, method) if __name__ == "__main__": diff --git a/src/python/tools/change_to_https.py b/src/python/tools/change_to_https.py new file mode 100644 index 00000000..9091452e --- /dev/null +++ b/src/python/tools/change_to_https.py @@ -0,0 +1,44 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
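The test-suite hunk above replaces the old single `test_examples` loop with one generated `unittest` method per example script, attached to `TestDocsExamples` via `setattr` so each example passes or fails independently. A minimal, self-contained sketch of that pattern (hypothetical file names, not the suite's real subprocess logic):

```python
import unittest

def make_case(path):
    # Factory so each generated method closes over its own path;
    # the real suite launches the example in a subprocess instead.
    def method(self):
        self.assertTrue(path.endswith('.py'))
    return method

class TestExamples(unittest.TestCase):
    pass

# Method names must start with 'test_' for unittest discovery.
for path in ['FastLinearClassifier.py', 'NGramFeaturizer.py']:
    name = 'test_%s' % path.replace('.py', '').lower()
    setattr(TestExamples, name, make_case(path))

if __name__ == '__main__':
    unittest.main()
```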
+# -------------------------------------------------------------------------
+
+# Converts all valid HTTP links to HTTPS. The input HTTP
+# links are read from alterable_urls.csv, which
+# is generated by find_http_urls.py
+# Usage: python3 change_to_https.py urls.csv path_to_repo
+
+import sys
+import os
+import csv
+
+def changeUrls(pathToReportCsv, pathToRootDirectory):
+    with open(pathToReportCsv, newline='') as csvFile:
+        csv_reader = csv.reader(csvFile, delimiter='\t')
+        line_count = 0
+        for row in csv_reader:
+            if line_count == 0:
+                line_count += 1
+            else:
+                #URL: row[0]
+                #relativePath: row[1]
+                print(row[1])
+                absolutePath = pathToRootDirectory+row[1]
+                with open(absolutePath) as f:
+                    fullText = f.read()
+                # Replace only the scheme prefix so a later 'http'
+                # inside the URL is left untouched.
+                fullText = fullText.replace(row[0], row[0].replace('http:', 'https:', 1))
+                with open(absolutePath, 'w') as f:
+                    f.write(fullText)
+                print("Altered {} in file: {}".format(row[0], absolutePath))
+                line_count += 1
+    print(f'Processed {line_count - 1} URLs.')
+
+def main():
+    if len(sys.argv) < 3:
+        print("Usage: python3 change_to_https.py urls.csv path_to_repo")
+        exit(1)
+    changeUrls(sys.argv[1], sys.argv[2])
+
+
+if __name__ == "__main__":
+    main()
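change_to_https.py consumes the tab-separated report that find_http_urls.py (below) writes out. A small sketch of that handoff, with throwaway paths that are not part of this commit; note the trailing slash on the repo root, since changeUrls concatenates it directly with the './'-prefixed relative path:

```python
import csv
import os

# Build a tiny repo with one HTTP link in it.
os.makedirs('repo/docs', exist_ok=True)
with open('repo/docs/readme.md', 'w') as f:
    f.write('See http://example.com/guide for details.\n')

# Write a report in the same shape find_http_urls.py emits:
# a header row, then tab-separated (url, relativeFilepath) pairs.
with open('urls.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['url', 'relativeFilepath'])
    writer.writerow(['http://example.com/guide', './docs/readme.md'])

# changeUrls('urls.csv', 'repo/') then rewrites the link in place:
# 'http://example.com/guide' -> 'https://example.com/guide'
```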
diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py
index 21b6d1f4..3aa233ac 100644
--- a/src/python/tools/code_fixer.py
+++ b/src/python/tools/code_fixer.py
@@ -247,12 +247,18 @@ def fix_code(class_name, filename):
     all_args['output_for_sub_graph'] = {'Model' : \
         all_args['predictor_model']}"""
 
+prefixcolumnconcatenator_1 = "output_columns = input_columns"
+prefixcolumnconcatenator_1_correct = """raise ValueError(
+    "'None' output passed when it cannot be none.")"""
+
 signature_fixes_core = {
     'NGramFeaturizer': (textTransform_1, textTransform_1_correct),
     'ColumnConcatenator': [(concatColumns_1, concatColumns_1_correct)],
     'ColumnSelector': [(columnselector_1, columnselector_1_correct)],
     'OneVsRestClassifier': [
         (onevsrestclassifier_1, onevsrestclassifier_1_correct)],
+    'PrefixColumnConcatenator': (prefixcolumnconcatenator_1,
+                                 prefixcolumnconcatenator_1_correct)
 }
diff --git a/src/python/tools/find_http_urls.py b/src/python/tools/find_http_urls.py
new file mode 100644
index 00000000..e22eb757
--- /dev/null
+++ b/src/python/tools/find_http_urls.py
@@ -0,0 +1,103 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------
+
+# Finds all HTTP URLs in the NimbusML repository and checks whether
+# each one can safely be converted to HTTPS (the conversion itself
+# is done by change_to_https.py)
+# Usage: python3 find_http_urls.py path_to_repo
+# Output: alterable_urls.csv, non_alterable_urls.csv, invalid_urls.csv
+
+# Required non-standard pip library: urlextract
+
+import sys
+import os
+import requests
+import csv
+import collections
+import pathlib
+from urlextract import URLExtract
+
+def addToDictionary(store, key, value):
+    if key not in store:
+        store[key] = [value]
+    else:
+        if value not in store[key]:
+            store[key].append(value)
+    return store
+
+def findHttpUrls(searchRootDirectory):
+    alterableUrlsStore = {}
+    nonAlterableUrlsStore = {}
+    invalidUrlsStore = {}
+    extractor = URLExtract()
+    lengthOfOriginalRootPath = -1
+    for root, _, files in os.walk(searchRootDirectory, onerror=None):
+        if lengthOfOriginalRootPath == -1:
+            lengthOfOriginalRootPath = len(root)
+        for filename in files:
+            if pathlib.Path(filename).suffix in ['.props', '.pyproj', '.vcxproj', '.snk'] or '.git' in root:
+                continue
+            absoluteFilePath = os.path.join(root, filename)
+            relativeFilePath = '.' + absoluteFilePath[lengthOfOriginalRootPath:]
+            try:
+                with open(absoluteFilePath, "rb") as f:
+                    data = f.read()
+                    try:
+                        data = data.decode("utf-8")
+                    except Exception as e:
+                        print("Unable to decode file: {} in UTF-8 encoding.".format(relativeFilePath))
+                        print(str(e))
+                        continue
+                    currentUrlList = extractor.find_urls(data)
+                    currentUrlList = [url for url in currentUrlList if url[:5] == "http:"]
+                    for selectedUrl in currentUrlList:
+                        try:
+                            request = requests.get(selectedUrl)
+                            if request.status_code == 200:
+                                changedSelectedUrl = selectedUrl.replace("http:", "https:", 1)
+                                try:
+                                    newRequest = requests.get(changedSelectedUrl)
+                                    if newRequest.status_code == 200:
+                                        alterableUrlsStore = addToDictionary(alterableUrlsStore, selectedUrl, relativeFilePath)
+                                    else:
+                                        nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, selectedUrl, relativeFilePath)
+                                except requests.exceptions.RequestException:
+                                    nonAlterableUrlsStore = addToDictionary(nonAlterableUrlsStore, selectedUrl, relativeFilePath)
+                            else:
+                                invalidUrlsStore = addToDictionary(invalidUrlsStore, selectedUrl, relativeFilePath)
+                        except requests.exceptions.RequestException:
+                            invalidUrlsStore = addToDictionary(invalidUrlsStore, selectedUrl, relativeFilePath)
+            except (IOError, OSError):
+                pass
+    makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore)
+
+def makeReports(alterableUrlsStore, nonAlterableUrlsStore, invalidUrlsStore):
+    with open('alterable_urls.csv', mode='w', newline='') as csv_file:
+        writer1 = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer1.writerow(["url", "relativeFilepath"])
+        for urlKey in alterableUrlsStore:
+            for fileValue in alterableUrlsStore[urlKey]:
+                writer1.writerow([urlKey, fileValue])
+    with open('non_alterable_urls.csv', mode='w', newline='') as csv_file:
+        writer2 = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer2.writerow(["url", "relativeFilepath"])
+        for urlKey in nonAlterableUrlsStore:
+            for fileValue in nonAlterableUrlsStore[urlKey]:
+                writer2.writerow([urlKey, fileValue])
+    with open('invalid_urls.csv', mode='w', newline='') as csv_file:
+        writer3 = csv.writer(csv_file, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer3.writerow(["url", "relativeFilepath"])
+        for urlKey in invalidUrlsStore:
+            for fileValue in invalidUrlsStore[urlKey]:
+                writer3.writerow([urlKey, fileValue])
+    return
+
+def 
main(): + if len(sys.argv) < 2: + print("Usage: python3 find_http_urls.py path_to_repo") + exit(1) + findHttpUrls(sys.argv[1]) + +if __name__ == "__main__": + main() diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index 45eb1a38..5b739e57 100644 --- a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -2194,119 +2194,6 @@ "ITrainerInput" ] }, - { - "Name": "Models.OnnxConverter", - "Desc": "Converts the model to ONNX format.", - "FriendlyName": "ONNX Converter.", - "ShortName": null, - "Inputs": [ - { - "Name": "DataFile", - "Type": "String", - "Desc": "The data file", - "Aliases": [ - "data" - ], - "Required": false, - "SortOrder": 0.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Onnx", - "Type": "String", - "Desc": "The path to write the output ONNX to.", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Json", - "Type": "String", - "Desc": "The path to write the output JSON to.", - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "The 'name' property in the output ONNX. By default this will be the ONNX extension-less name.", - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Domain", - "Type": "String", - "Desc": "The 'domain' property in the output ONNX.", - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "InputsToDrop", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Array of input column names to drop", - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "OutputsToDrop", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Array of output column names to drop", - "Required": false, - "SortOrder": 8.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Model that needs to be converted to ONNX format.", - "Required": false, - "SortOrder": 10.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "OnnxVersion", - "Type": { - "Kind": "Enum", - "Values": [ - "Stable", - "Experimental" - ] - }, - "Desc": "The targeted ONNX version. It can be either \"Stable\" or \"Experimental\". 
If \"Experimental\" is used, produced model can contain components that is not officially supported in ONNX standard.", - "Required": false, - "SortOrder": 11.0, - "IsNullable": false, - "Default": "Stable" - }, - { - "Name": "PredictiveModel", - "Type": "PredictorModel", - "Desc": "Predictor model that needs to be converted to ONNX format.", - "Required": false, - "SortOrder": 12.0, - "IsNullable": false, - "Default": null - } - ], - "Outputs": [] - }, { "Name": "Models.OvaModelCombiner", "Desc": "Combines a sequence of PredictorModels into a single model", @@ -3061,6 +2948,29 @@ "ITrainerOutput" ] }, + { + "Name": "Models.Schema", + "Desc": "Retrieve output model schema", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "The transform model.", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "Schema", + "Type": "DataView", + "Desc": "The model schema" + } + ] + }, { "Name": "Models.Summarizer", "Desc": "Summarize a linear regression predictor.", @@ -4071,7 +3981,7 @@ { "Name": "TrainSize", "Type": "Int", - "Desc": "The length of series from the begining used for training.", + "Desc": "The length of series from the beginning used for training.", "Required": true, "SortOrder": 2.0, "IsNullable": false, @@ -10508,7 +10418,7 @@ }, { "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.", "FriendlyName": "Generalized Additive Model for Binary Classification", "ShortName": "gam", "Inputs": [ @@ -10808,7 +10718,7 @@ }, { "Name": "Trainers.GeneralizedAdditiveModelRegressor", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.", "FriendlyName": "Generalized Additive Model for Regression", "ShortName": "gamr", "Inputs": [ @@ -13829,7 +13739,7 @@ }, { "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Maximum entrypy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function.", + "Desc": "Maximum entropy classification is a method in statistics used to predict the probabilities of parallel events. 
The model predicts the probabilities of parallel events by fitting data to a softmax function.", "FriendlyName": "Multi-class Logistic Regression", "ShortName": "mlr", "Inputs": [ @@ -18117,6 +18027,51 @@ } ] }, + { + "Name": "Transforms.DatasetScorerEx", + "Desc": "Score a dataset with a predictor model", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "The dataset to be scored", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model to apply to data", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "Suffix", + "Type": "String", + "Desc": "Suffix to append to the score columns", + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "ScoredData", + "Type": "DataView", + "Desc": "The scored dataset" + }, + { + "Name": "ScoringTransform", + "Type": "TransformModel", + "Desc": "The scoring transform" + } + ] + }, { "Name": "Transforms.DatasetTransformScorer", "Desc": "Score a dataset with a transform model", @@ -18189,46 +18144,6 @@ "SortOrder": 2.0, "IsNullable": false }, - { - "Name": "ColumnsToDrop", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Enum", - "Values": [ - "Year", - "Month", - "Day", - "Hour", - "Minute", - "Second", - "AmPm", - "Hour12", - "DayOfWeek", - "DayOfQuarter", - "DayOfYear", - "WeekOfMonth", - "QuarterOfYear", - "HalfOfYear", - "WeekIso", - "YearIso", - "MonthLabel", - "AmPmLabel", - "DayOfWeekLabel", - "HolidayName", - "IsPaidTimeOff" - ] - } - }, - "Desc": "Columns to drop after the DateTime Expansion", - "Aliases": [ - "drop" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": null - }, { "Name": "Country", "Type": { @@ -20909,7 +20824,7 @@ }, { "Name": "Transforms.MissingValueHandler", - "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.", + "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). 
An indicator column can optionally be concatenated, if the input column type is numeric.", "FriendlyName": "NA Handle Transform", "ShortName": "NAHandle", "Inputs": [ @@ -22052,6 +21967,82 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.PrefixColumnConcatenator", + "Desc": "Concatenates one or more columns of the same item type.", + "FriendlyName": "Concat Transform", + "ShortName": "Concat", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.RandomNumberGenerator", "Desc": "Adds a column with a generated number sequence.", @@ -22162,7 +22153,7 @@ "Name": "Transforms.RobustScaler", "Desc": "Removes the median and scales the data according to the quantile range.", "FriendlyName": "RobustScalerTransformer", - "ShortName": "RobustScalerTransformer", + "ShortName": "RobScalT", "Inputs": [ { "Name": "Column", @@ -23296,7 +23287,7 @@ "Name": "Transforms.TimeSeriesImputer", "Desc": "Fills in missing row and values", "FriendlyName": "TimeSeriesImputer", - "ShortName": "TimeSeriesImputer", + "ShortName": "tsi", "Inputs": [ { "Name": "TimeSeriesColumn", @@ -23372,8 +23363,7 @@ "Values": [ "ForwardFill", "BackFill", - "Median", - "Interpolate" + "Median" ] }, "Desc": "Mode for imputing, defaults to ForwardFill if not provided", @@ -23421,7 +23411,7 @@ "Name": "Transforms.ToString", "Desc": "Turns the given column into a column of its string representation", "FriendlyName": "ToString Transform", - "ShortName": "ToStringTransform", + "ShortName": "tostr", "Inputs": [ { "Name": "Column", @@ -23640,6 +23630,61 @@ } ] }, + { + "Name": "Transforms.VariableColumnTransform", + "Desc": "Combines the specified input columns in to a single variable length vectorized column.", + "FriendlyName": "Variable Column Creator", + "ShortName": "Variable Column Creator", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Features", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Features", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LengthColumnName", + "Type": "String", + "Desc": "Length Column Name", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform 
model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.VectorToImage", "Desc": "Converts vector array into image type.", @@ -29514,140 +29559,6 @@ } ] }, - { - "Kind": "PartitionedPathParser", - "Components": [ - { - "Name": "ParquetPathParser", - "Desc": "Extract name/value pairs from Parquet formatted directory names. Example path: Year=2018/Month=12/data1.parquet", - "FriendlyName": "Parquet Partitioned Path Parser", - "Aliases": [ - "ParqPP" - ], - "Settings": [] - }, - { - "Name": "SimplePathParser", - "Desc": "A simple parser that extracts directory names as column values. Column names are defined as arguments.", - "FriendlyName": "Simple Partitioned Path Parser", - "Aliases": [ - "SmplPP" - ], - "Settings": [ - { - "Name": "Columns", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the column.", - "Required": true, - "SortOrder": 150.0, - "IsNullable": false - }, - { - "Name": "Type", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "Data type of the column.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Source", - "Type": "Int", - "Desc": "Index of the directory representing this column.", - "Required": true, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - } - ] - } - }, - "Desc": "Column definitions used to override the Partitioned Path Parser. Expected with the format name:type:numeric-source, for example, col=MyFeature:R4:1", - "Aliases": [ - "col" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Type", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "Data type of each column.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "TX" - } - ] - } - ] - }, { "Kind": "RegressionLossFunction", "Components": [ diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index d5b0b3a3..c94a8845 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -176,7 +176,6 @@ "Common.MakeArrayIDataView", "Common.MakeArrayIPredictorModel", "CountTable.Create", - "Dssm.Trigram", "EnsembleCreator.CreateAnomalyPipelineEnsemble", "EnsembleCreator.CreateBinaryEnsemble", "EnsembleCreator.CreateBinaryPipelineEnsemble", @@ -781,6 +780,12 @@ "Module": "feature_extraction.text", "Type": "Transform" }, + { + "Name": "Transforms.NGramTranslator", + "NewName": "NGramExtractor", + "Module": "feature_extraction.text", + "Type": "Transform" + }, { "Name": "Transforms.WordEmbeddings", "NewName": "WordEmbedding", @@ -809,6 +814,12 @@ "NewName": "TypeConverter", "Module": "preprocessing.schema", "Type": "Transform" + }, + { + "Name": "Transforms.PrefixColumnConcatenator", + "NewName": "PrefixColumnConcatenator", + "Module": "preprocessing.schema", + "Type": "Transform" } ], "Components": [ diff --git a/src/python/tools/temp_docs_updater.py 
b/src/python/tools/temp_docs_updater.py
new file mode 100644
index 00000000..3915d19e
--- /dev/null
+++ b/src/python/tools/temp_docs_updater.py
@@ -0,0 +1,383 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------
+import os
+import re
+import stat
+import shutil
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+from code_fixer import run_autopep
+
+
+description = """
+This module helps with merging the changes from the master branch
+into the temp/docs branch. Here are the steps it takes:
+
+1. Create a local clone of the user's fork of NimbusML.
+
+2. Create a new branch in the clone created in step (1)
+   which tracks the temp/docs branch of the official
+   NimbusML repository.
+
+3. Remove all the tracked files from the local branch
+   created in step (2).
+
+4. Create a local clone of the master branch of the official
+   NimbusML repository and check out the specified commit
+   (default is HEAD).
+
+5. Copy all the tracked files from (4) into (2).
+
+6. Modify the files in (2) to be compatible with the
+   documentation requirements.
+"""
+
+
+NIMBUSML_GIT_URL = 'https://github.com/microsoft/NimbusML.git'
+
+# This list should not contain 'core/...' dirs.
+# Subdirectories will not be automatically traversed
+# and need to be explicitly added to this list.
+ENTRYPOINT_DIRS = [
+    r'src\python\nimbusml\cluster',
+    r'src\python\nimbusml\decomposition',
+    r'src\python\nimbusml\ensemble',
+    r'src\python\nimbusml\ensemble\booster',
+    r'src\python\nimbusml\ensemble\feature_selector',
+    r'src\python\nimbusml\ensemble\output_combiner',
+    r'src\python\nimbusml\ensemble\sub_model_selector',
+    r'src\python\nimbusml\ensemble\sub_model_selector\diversity_measure',
+    r'src\python\nimbusml\ensemble\subset_selector',
+    r'src\python\nimbusml\feature_extraction',
+    r'src\python\nimbusml\feature_extraction\categorical',
+    r'src\python\nimbusml\feature_extraction\image',
+    r'src\python\nimbusml\feature_extraction\text',
+    r'src\python\nimbusml\feature_extraction\text\extractor',
+    r'src\python\nimbusml\feature_extraction\text\stopwords',
+    r'src\python\nimbusml\feature_selection',
+    r'src\python\nimbusml\linear_model',
+    r'src\python\nimbusml\model_selection',
+    r'src\python\nimbusml\multiclass',
+    r'src\python\nimbusml\naive_bayes',
+    r'src\python\nimbusml\preprocessing',
+    r'src\python\nimbusml\preprocessing\filter',
+    r'src\python\nimbusml\preprocessing\missing_values',
+    r'src\python\nimbusml\preprocessing\normalization',
+    r'src\python\nimbusml\preprocessing\schema',
+    r'src\python\nimbusml\preprocessing\text',
+    r'src\python\nimbusml\timeseries',
+]
+
+
+def print_title(message):
+    print('\n', '-' * 50, message, '-' * 50, sep='\n')
+
+
+def get_dir_entries(directory, names_to_ignore=None, paths_to_ignore=None):
+    if not names_to_ignore:
+        names_to_ignore = []
+
+    if not paths_to_ignore:
+        paths_to_ignore = []
+
+    files = {}
+    sub_dirs = {}
+
+    with os.scandir(directory) as it:
+        for entry in it:
+            if entry.name in names_to_ignore:
+                continue
+
+            if any([(x in entry.path) for x in paths_to_ignore]):
+                continue
+
+            if entry.is_file():
+                files[entry.name] = entry
+
+            elif entry.is_dir():
+                sub_dirs[entry.name] = entry
+
+    return files, sub_dirs
+
+
+def rmdir(path):
+    def remove_readonly(func, path, _):
+        "Clear the readonly bit and reattempt the removal"
+        os.chmod(path, stat.S_IWRITE)
+        func(path)
+
+    shutil.rmtree(path, onerror=remove_readonly)
+
+
+def replace_file_contents(file_path, old, new, is_re=False):
+    with open(file_path, 'rt') as f:
+        contents = f.read()
+
+    if is_re:
+        contents = re.sub(old, new, contents)
+    else:
+        contents = contents.replace(old, new)
+
+    with open(file_path, 'wt') as f:
+        f.write(contents)
+
+
+def init_target_repo(repo_dir, fork_git_url, branch_name):
+    cwd = os.getcwd()
+
+    if os.path.isdir(repo_dir):
+        print(f'Directory {repo_dir} already exists. Removing it...')
+        rmdir(repo_dir)
+
+    print_title(f'Cloning repository {fork_git_url} into {repo_dir}...')
+    os.mkdir(repo_dir)
+    os.chdir(repo_dir)
+    subprocess.run(['git', 'clone', fork_git_url, '.'])
+    subprocess.run(['git', 'remote', 'add', 'upstream', NIMBUSML_GIT_URL])
+
+    print('\nAvailable remotes:')
+    subprocess.run(['git', 'remote', '-v'])
+
+    print_title('Fetching upstream branches and creating local branch...')
+    subprocess.run(['git', 'fetch', 'upstream'])
+    subprocess.run(['git', 'checkout', '-b', branch_name, '--track', 'upstream/temp/docs'])
+
+    print('\nBranches:')
+    subprocess.run(['git', 'branch', '-vv'])
+
+    os.chdir(cwd)
+
+
+def clear_repo(repo_dir):
+    files, subdirs = get_dir_entries(repo_dir, names_to_ignore=['.git'])
+
+    for dir_entry in files.values():
+        os.remove(dir_entry)
+
+    for dir_entry in subdirs.values():
+        rmdir(dir_entry)
+
+
+def git_add_all_modifications(repo_dir):
+    cwd = os.getcwd()
+    os.chdir(repo_dir)
+    subprocess.run(['git', 'add', '-A'])
+    os.chdir(cwd)
+
+
+def get_master_repo(commit=None):
+    tmp_dir = tempfile.mkdtemp()
+    cwd = os.getcwd()
+    os.chdir(tmp_dir)
+
+    commit_name = commit if commit else 'HEAD'
+    print_title(f'Cloning master branch from {NIMBUSML_GIT_URL} into {tmp_dir} at commit {commit_name}...')
+    subprocess.run(['git', 'clone', NIMBUSML_GIT_URL, '.'])
+
+    if commit:
+        subprocess.run(['git', 'checkout', commit])
+
+    os.chdir(cwd)
+    return tmp_dir
+
+
+def copy_to_dir(dst, src_files, src_dirs):
+    for dir_entry in src_files.values():
+        shutil.copy2(dir_entry, dst)
+
+    for dir_entry in src_dirs.values():
+        shutil.copytree(dir_entry, os.path.join(dst, dir_entry.name))
+
+
+def update_entrypoint_compiler(repo_dir):
+    print_title('Updating entrypoint_compiler...')
+
+    path = os.path.join(repo_dir, 'src', 'python', 'tools', 'entrypoint_compiler.py')
+    replace_file_contents(path,
+                          'class_file = class_name.lower()',
+                          "class_file = '_' + class_name.lower()")
+
+    print('entrypoint_compiler.py updated.')
+
+
+def rename_data_dir(repo_dir):
+    print_title('Renaming data directory...')
+
+    datasets_dir = os.path.join(repo_dir, 'src', 'python', 'nimbusml', 'datasets')
+    data_dir_src = os.path.join(datasets_dir, 'data')
+    data_dir_dst = os.path.join(datasets_dir, '_data')
+    os.rename(data_dir_src, data_dir_dst)
+
+    path = os.path.join(repo_dir, 'src', 'python', 'nimbusml.pyproj')
+    replace_file_contents(path, 'nimbusml\\datasets\\data\\', 'nimbusml\\datasets\\_data\\')
+
+    # Update the dataset.py file to fix the data dir references
+    replace_file_contents(os.path.join(datasets_dir, 'datasets.py'),
+                          r'([\r\n]DATA_DIR.+)data',
+                          r'\g<1>_data',
+                          True)
+
+    print('Data directory renamed.')
+
+
+def rename_entrypoint_file(dir_entry):
+    module_name = dir_entry.name.replace('.py', '')
+    print(f'Renaming module: {module_name}\n\t({dir_entry.path})\n')
+
+    # Update the import statement in the public file
+    replace_file_contents(dir_entry.path,
+                          r'(?s)([\r\n]from\s+.*\.){0}'.format(module_name),
+                          r'\g<1>_{0}'.format(module_name),
+                          True)
+ + # Rename the public file to have an underscore as its first character + new_path = os.path.join(os.path.dirname(dir_entry), f'_{dir_entry.name}') + os.rename(dir_entry.path, new_path) + + # Run autopep on the modified file since the modifications + # might require new formatting which entrypoint_compiler is + # expecting when run with the --check_manual_changes option. + if not new_path.endswith('_cv.py'): + run_autopep(new_path) + + # Update the import statement in __init__.py + init_path = os.path.join(os.path.dirname(dir_entry), '__init__.py') + replace_file_contents(init_path, + r'(^from\s+.*\.|[\r\n]from\s+.*\.){0}'.format(module_name), + r'\g<1>_{0}'.format(module_name), + True) + + parts = Path(dir_entry).parts + last_index = max(i for i, val in enumerate(parts) if val == 'nimbusml') + + base_dir = os.path.join(*parts[:last_index]) + package_dir = os.path.join(*parts[last_index:-1]) + internal_dir = os.path.join(*parts[:last_index+1], 'internal', 'core', *parts[last_index+1:-1]) + internal_pkg_dir = os.path.join('nimbusml', 'internal', 'core', *parts[last_index+1:-1]) + + # Rename the internal file to have an underscore as its first character + if os.path.exists(internal_dir): + os.rename(os.path.join(internal_dir, dir_entry.name), + os.path.join(internal_dir, '_' + dir_entry.name)) + + # Update nimbusml.pyproj with the public and internal name changes + replace_file_contents(os.path.join(base_dir, 'nimbusml.pyproj'), + os.path.join(package_dir, dir_entry.name), + os.path.join(package_dir, '_' + dir_entry.name)) + replace_file_contents(os.path.join(base_dir, 'nimbusml.pyproj'), + os.path.join(internal_pkg_dir, dir_entry.name), + os.path.join(internal_pkg_dir, '_' + dir_entry.name)) + + +def rename_entrypoints(repo_dir): + print_title('Renaming entry point files...') + + for ep_dir in ENTRYPOINT_DIRS: + path = os.path.join(repo_dir, ep_dir) + files, _ = get_dir_entries(path) + + for dir_entry in files.values(): + if dir_entry.name.endswith('.py') and not dir_entry.name == '__init__.py': + rename_entrypoint_file(dir_entry) + + +def rename_pipeline(repo_dir): + nimbusml_path = os.path.join(repo_dir, 'src', 'python', 'nimbusml') + os.rename(os.path.join(nimbusml_path, 'pipeline.py'), + os.path.join(nimbusml_path, '_pipeline.py')) + + replace_file_contents(os.path.join(nimbusml_path, '__init__.py'), + 'from .pipeline import Pipeline', + 'from ._pipeline import Pipeline') + + replace_file_contents(os.path.join(nimbusml_path, '__init__.py.in'), + 'from .pipeline import Pipeline', + 'from ._pipeline import Pipeline') + + replace_file_contents(os.path.join(repo_dir, 'src', 'python', 'nimbusml.pyproj'), + r'nimbusml\pipeline.py', + r'nimbusml\_pipeline.py') + + replace_file_contents(os.path.join(nimbusml_path, 'tests', 'test_syntax_expected_failures.py'), + 'from nimbusml.pipeline import TrainedWarning', + 'from nimbusml._pipeline import TrainedWarning') + + +# TODO: the fixes in this method shouldn't be necessary. 
+def fix_files(repo_dir):
+    stopwords_dir = os.path.join(repo_dir,
+                                 'src', 'python', 'nimbusml',
+                                 'feature_extraction', 'text',
+                                 'stopwords')
+
+    replace_file_contents(os.path.join(stopwords_dir, '_customstopwordsremover.py'),
+                          '__all__ = ["CustomStopWordsRemover"]',
+                          '__all__ = ["CustomStopWordsRemover"]\n')
+
+    replace_file_contents(os.path.join(stopwords_dir, '_predefinedstopwordsremover.py'),
+                          '__all__ = ["PredefinedStopWordsRemover"]',
+                          '__all__ = ["PredefinedStopWordsRemover"]\n')
+
+
+def parse_command_line():
+    global description
+    arg_parser = argparse.ArgumentParser(description=description)
+
+    arg_parser.add_argument('repo_dir',
+                            help='The location on disk at which to create the new local '
+                                 'repo which will contain the updated temp/docs branch.',
+                            type=str)
+
+    arg_parser.add_argument('fork_git_url',
+                            help='The URL to use for the local repository. This will usually '
+                                 "be the user's forked repository.",
+                            type=str)
+
+    arg_parser.add_argument('branch_name',
+                            help='The name of the new branch which will track temp/docs. '
+                                 'This branch will be created in the locally cloned copy of the '
+                                 'repo pointed to by fork_git_url.',
+                            type=str)
+
+    arg_parser.add_argument('-c', '--commit',
+                            help='The latest commit to include in the changes '
+                                 'for the new local temp/docs branch.',
+                            type=str)
+
+    args = arg_parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_command_line()
+
+    repo_dir = Path(args.repo_dir).resolve()
+
+    init_target_repo(repo_dir,
+                     args.fork_git_url,
+                     args.branch_name)
+
+    clear_repo(repo_dir)
+
+    master_repo_dir = get_master_repo(args.commit)
+
+    entries = get_dir_entries(master_repo_dir, names_to_ignore=['.git'])
+    copy_to_dir(repo_dir, *entries)
+
+    rmdir(master_repo_dir)
+
+    update_entrypoint_compiler(repo_dir)
+    rename_data_dir(repo_dir)
+    rename_entrypoints(repo_dir)
+    rename_pipeline(repo_dir)
+
+    fix_files(repo_dir)
+
+    git_add_all_modifications(repo_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/version.txt b/version.txt
index 8e03717d..9c6d6293 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.5.1
\ No newline at end of file
+1.6.1
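temp_docs_updater.py does most of its renaming through `replace_file_contents` with regular expressions, using a backreference to keep the import prefix while prepending the underscore to the module segment. A reduced sketch of that rewrite (hypothetical module name, not the script itself):

```python
import re

module_name = 'pipeline'  # hypothetical module being renamed to _pipeline
source = 'from nimbusml.pipeline import Pipeline\n'

# \g<1> re-emits the captured 'from nimbusml.' prefix, so only the
# module segment gains the leading underscore.
pattern = r'(from\s+.*\.){0}'.format(module_name)
fixed = re.sub(pattern, r'\g<1>_{0}'.format(module_name), source)
assert fixed == 'from nimbusml._pipeline import Pipeline\n'
```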
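The git plumbing in the script shells out with `subprocess.run` plus manual `os.chdir` bookkeeping. An equivalent sketch that passes `cwd=` instead, avoiding the chdir/restore dance (hypothetical URLs and names; `check=True` is an addition so a failed git command raises instead of being silently ignored):

```python
import os
import subprocess

def clone_and_track(repo_dir, fork_url, upstream_url, branch):
    # Clone the fork, register the official repo as 'upstream', then
    # create a local branch tracking upstream's temp/docs branch.
    os.makedirs(repo_dir, exist_ok=True)
    for args in (
        ['git', 'clone', fork_url, '.'],
        ['git', 'remote', 'add', 'upstream', upstream_url],
        ['git', 'fetch', 'upstream'],
        ['git', 'checkout', '-b', branch, '--track', 'upstream/temp/docs'],
    ):
        subprocess.run(args, cwd=repo_dir, check=True)
```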