diff --git a/README.md b/README.md
index 1ec683ab..5fff9d39 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,13 @@
`nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning).
-ML.NET was originally developed in Microsoft Research and is used across many product groups in Microsoft like Windows, Bing, PowerPoint, Excel and others. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance.
+ML.NET was originally developed in Microsoft Research and is used across many Microsoft product groups, such as Windows, Bing, PowerPoint, and Excel. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance.
-This package enables training ML.NET pipelines or integrating ML.NET components directly into [scikit-learn](https://scikit-learn.org/stable/) pipelines (it supports `numpy.ndarray`, `scipy.sparse_cst`, and `pandas.DataFrame` as inputs).
+`nimbusml` enables training ML.NET pipelines or integrating ML.NET components directly into [scikit-learn](https://scikit-learn.org/stable/) pipelines. It adheres to existing `scikit-learn` conventions, allowing simple interoperability between `nimbusml` and `scikit-learn` components, while adding a suite of fast, highly optimized, and scalable algorithms, transforms, and components written in C++ and C#.
+
+See the examples below showing interoperability with `scikit-learn`. A more detailed example in the [documentation](https://docs.microsoft.com/en-us/nimbusml/tutorials/b_c-sentiment-analysis-3-combining-nimbusml-and-scikit-learn) shows how to use a `nimbusml` component in a `scikit-learn` pipeline and how to create a pipeline using only `nimbusml` components.
+
+`nimbusml` supports `numpy.ndarray`, `scipy.sparse.csr_matrix`, and `pandas.DataFrame` as inputs. In addition, `nimbusml` supports streaming from files with `FileDataStream`, which avoids loading the dataset into memory and allows training on data significantly exceeding available memory.
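+
+For example, a model can be trained directly from a `FileDataStream` (a minimal sketch; the file name, column names, and schema below are illustrative):
+
+```python
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier
+
+# Stream the training data from disk instead of loading it all into memory.
+data = FileDataStream.read_csv('train.csv')  # hypothetical file
+
+pipe = Pipeline([
+    OneHotVectorizer(columns={'c0': 'c0'}),
+    LogisticRegressionBinaryClassifier(feature=['c0', 'c1'], label='label')
+])
+pipe.fit(data)
+```
+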
Documentation can be found [here](https://docs.microsoft.com/en-us/NimbusML/overview) and additional notebook samples can be found [here](https://github.com/Microsoft/NimbusML-Samples).
@@ -84,7 +88,7 @@ To build `nimbusml` from source please visit our [developer guide](docs/develope
## Contributing
-The contributions guide can be found [here](CONTRIBUTING.md). Given the experimental nature of this project, support will be provided on a best-effort basis. We suggest opening an issue for discussion before starting a PR with big changes.
+The contributions guide can be found [here](CONTRIBUTING.md).
## Support
diff --git a/build.cmd b/build.cmd
index 8ad4a127..46c56ac5 100644
--- a/build.cmd
+++ b/build.cmd
@@ -26,6 +26,9 @@ set RunExtendedTests=False
set BuildDotNetBridgeOnly=False
set SkipDotNetBridge=False
set AzureBuild=False
+set BuildManifestGenerator=False
+set UpdateManifest=False
+set VerifyManifest=False
:Arg_Loop
if [%1] == [] goto :Build
@@ -53,6 +56,10 @@ if /i [%1] == [--skipDotNetBridge] (
set SkipDotNetBridge=True
shift && goto :Arg_Loop
)
+if /i [%1] == [--updateManifest] (
+ set UpdateManifest=True
+ shift && goto :Arg_Loop
+)
if /i [%1] == [--azureBuild] (
set AzureBuild=True
shift && goto :Arg_Loop
@@ -68,6 +75,7 @@ echo " --installPythonPackages Install python packages after build"
echo " --includeExtendedTests Include the extended tests if the tests are run"
echo " --buildDotNetBridgeOnly Build only DotNetBridge"
echo " --skipDotNetBridge Build everything except DotNetBridge"
+echo " --updateManifest Update manifest.json"
echo " --azureBuild Building in azure devops (adds dotnet CLI to the path)"
goto :Exit_Success
@@ -173,8 +181,6 @@ if "%AzureBuild%" == "True" (
echo ##vso[task.prependpath]%_dotnetRoot%
)
-set LOCAL_NUGET_PACKAGES_DIR=.\local-nuget-packages
-
:: Build managed code
echo ""
echo "#################################"
@@ -191,6 +197,37 @@ if "%BuildDotNetBridgeOnly%" == "True" (
call "%_dotnet%" build -c %Configuration% --force "%__currentScriptDir%src\Platforms\build.csproj"
call "%_dotnet%" publish "%__currentScriptDir%src\Platforms\build.csproj" --force --self-contained -r win-x64 -c %Configuration%
+
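+:: The last five characters of %Configuration% identify the Python version
+:: (e.g. "RlsWinPy3.7" ends in "Py3.7"), so the manifest is only verified for Py3.7 builds.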
+if "%Configuration:~-5%" == "Py3.7" set VerifyManifest=True
+if "%VerifyManifest%" == "True" set BuildManifestGenerator=True
+if "%UpdateManifest%" == "True" set BuildManifestGenerator=True
+
+if "%BuildManifestGenerator%" == "True" (
+ echo ""
+ echo "#################################"
+ echo "Building Manifest Generator... "
+ echo "#################################"
+ call "%_dotnet%" build -c %Configuration% -o "%BuildOutputDir%%Configuration%" --force "%__currentScriptDir%src\ManifestGenerator\ManifestGenerator.csproj"
+)
+
+if "%UpdateManifest%" == "True" (
+ echo Updating manifest.json ...
+ call "%_dotnet%" "%BuildOutputDir%%Configuration%\ManifestGenerator.dll" create %__currentScriptDir%\src\python\tools\manifest.json
+ echo manifest.json updated.
+ echo Run entrypoint_compiler.py --generate_api --generate_entrypoints to generate entry points and api files.
+ goto :Exit_Success
+)
+
+if "%VerifyManifest%" == "True" (
+ echo Verifying manifest.json ...
+ call "%_dotnet%" "%BuildOutputDir%%Configuration%\ManifestGenerator.dll" verify %__currentScriptDir%\src\python\tools\manifest.json
+ if errorlevel 1 (
+ echo manifest.json is invalid.
+ echo Run build --updateManifest to update manifest.json.
+ goto :Exit_Error
+ )
+)
+
echo ""
echo "#################################"
echo "Downloading Dependencies "
@@ -352,13 +389,13 @@ if "%InstallPythonPackages%" == "True" (
echo "#################################"
echo "Installing python packages ... "
echo "#################################"
- call "%PythonExe%" -m pip install --upgrade pip
- call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
+ call "%PythonExe%" -m pip install --upgrade "pip==19.3.1"
+ call "%PythonExe%" -m pip install --upgrade nose pytest pytest-xdist graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
if %PythonVersion% == 2.7 (
call "%PythonExe%" -m pip install --upgrade pyzmq
) else (
- call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.12"
+ call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33"
)
call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%"
@@ -379,27 +416,53 @@ set TestsPath1=%PackagePath%\tests
set TestsPath2=%__currentScriptDir%src\python\tests
set TestsPath3=%__currentScriptDir%src\python\tests_extended
set ReportPath=%__currentScriptDir%build\TestCoverageReport
-call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
-if errorlevel 1 (
- goto :Exit_Error
-)
-call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+set NumConcurrentTests=%NUMBER_OF_PROCESSORS%
+
+call "%PythonExe%" -m pytest -n %NumConcurrentTests% --verbose --maxfail=1000 --capture=sys "%TestsPath2%" "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
if errorlevel 1 (
- goto :Exit_Error
+ REM Rerun any failed tests to give them one more
+ REM chance in case the errors were intermittent.
+ call "%PythonExe%" -m pytest -n %NumConcurrentTests% --last-failed --verbose --maxfail=1000 --capture=sys "%TestsPath2%" "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+ if errorlevel 1 (
+ goto :Exit_Error
+ )
)
if "%RunExtendedTests%" == "True" (
- call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+ call "%PythonExe%" -m pytest -n %NumConcurrentTests% --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
if errorlevel 1 (
- goto :Exit_Error
+ REM Rerun any failed tests to give them one more
+ REM chance in case the errors were intermittent.
+ call "%PythonExe%" -m pytest -n %NumConcurrentTests% --last-failed --verbose --maxfail=1000 --capture=sys "%TestsPath3%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+ if errorlevel 1 (
+ goto :Exit_Error
+ )
)
)
:Exit_Success
+call :CleanUpDotnet
endlocal
exit /b %ERRORLEVEL%
:Exit_Error
+call :CleanUpDotnet
endlocal
echo Failed with error %ERRORLEVEL%
-exit /b %ERRORLEVEL%
\ No newline at end of file
+exit /b %ERRORLEVEL%
+
+:CleanUpDotnet
+:: Save the error level so it can be
+:: restored when exiting the function
+set PrevErrorLevel=%ERRORLEVEL%
+
+:: Shut down all dotnet persistent servers so that the
+:: dotnet executable is not left open in the background.
+:: As of dotnet 2.1.3, three servers are left running in
+:: the background. This will shut them all down.
+:: See here for more info: https://github.com/dotnet/cli/issues/9458
+:: This fixes an issue when re-running the build script because
+:: the build script was trying to replace the existing dotnet
+:: binaries which were sometimes still in use.
+call "%_dotnet%" build-server shutdown
+exit /b %PrevErrorLevel%
\ No newline at end of file
diff --git a/build.sh b/build.sh
index e2292693..2b20be39 100755
--- a/build.sh
+++ b/build.sh
@@ -175,8 +175,6 @@ then
echo "Installing dotnet SDK ... "
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli
- export LOCAL_NUGET_PACKAGES_DIR=./local-nuget-packages
-
# Build managed code
echo "Building managed code ... "
_dotnet="${__currentScriptDir}/cli/dotnet"
@@ -284,7 +282,7 @@ then
exit 1
fi
# Review: Adding "--upgrade" to pip install will cause problems when using Anaconda as the python distro because of Anaconda's quirks with pytest.
- "${PythonExe}" -m pip install nose "pytest>=4.4.0" graphviz "pytest-cov>=2.6.1" "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
+ "${PythonExe}" -m pip install nose "pytest>=4.4.0" pytest-xdist graphviz "pytest-cov>=2.6.1" "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
if [ ${PythonVersion} = 2.7 ]
then
"${PythonExe}" -m pip install --upgrade pyzmq
@@ -294,7 +292,7 @@ then
"${PythonExe}" -m pip install --upgrade pytest-remotedata
fi
- "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.12"
+ "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33"
fi
"${PythonExe}" -m pip install --upgrade "${Wheel}"
"${PythonExe}" -m pip install "scikit-learn==0.19.2"
@@ -311,12 +309,25 @@ then
TestsPath2=${__currentScriptDir}/src/python/tests
TestsPath3=${__currentScriptDir}/src/python/tests_extended
ReportPath=${__currentScriptDir}/build/TestCoverageReport
- "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath1}"
- "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath2}"
+ "${PythonExe}" -m pytest -n 4 --verbose --maxfail=1000 --capture=sys "${TestsPath2}" "${TestsPath1}" || \
+ "${PythonExe}" -m pytest -n 4 --last-failed --verbose --maxfail=1000 --capture=sys "${TestsPath2}" "${TestsPath1}"
if [ ${__runExtendedTests} = true ]
- then
- "${PythonExe}" -m pytest --verbose --maxfail=1000 --capture=sys "${TestsPath3}"
+ then
+ echo "Running extended tests ... "
+ if [ ! "$(uname -s)" = "Darwin" ]
+ then
+ # Required for Image.py and Image_df.py to run successfully on Ubuntu.
+ {
+ apt-get update
+ apt-get install libc6-dev -y
+ apt-get install libgdiplus -y
+ } || {
+ # Required for Image.py and Image_df.py to run successfully on CentOS.
+ yum install glibc-devel -y
+ }
+ fi
+ "${PythonExe}" -m pytest -n 4 --verbose --maxfail=1000 --capture=sys "${TestsPath3}"
fi
fi
diff --git a/build/ci/phase-template.yml b/build/ci/phase-template.yml
index 4df9692c..047d95fe 100644
--- a/build/ci/phase-template.yml
+++ b/build/ci/phase-template.yml
@@ -26,7 +26,8 @@ phases:
- script: $(_buildScript) --configuration $(_configuration) --runTests $(_testOptions)
# Mac phases
- ${{ if eq(parameters.name, 'Mac') }}:
- - script: brew update && brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f5b1ac99a7fba27c19cee0bc4f036775c889b359/Formula/libomp.rb mono-libgdiplus gettext && brew link gettext --force
+ # Note: Manually specifying the libomp URL below is needed to avoid an error at runtime. Installing using 'brew install libomp' results in "Intel MKL FATAL ERROR: Cannot load libmkl_intel_thread.dylib."
+ - script: brew update && brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f5b1ac99a7fba27c19cee0bc4f036775c889b359/Formula/libomp.rb gettext && brew link gettext --force && brew unlink python@2 && brew install mono-libgdiplus
- ${{ if eq(parameters.testDistro, 'noTests') }}:
- script: chmod 777 $(_buildScript) && $(_buildScript) --configuration $(_configuration)
- ${{ if eq(parameters.testDistro, '') }}:
@@ -50,7 +51,6 @@ phases:
# Publish build artifacts
- ${{ if or(eq(parameters.name, 'Linux_Ubuntu16'), eq(parameters.name, 'Windows'), eq(parameters.name, 'Mac')) }}:
- task: PublishBuildArtifacts@1
- condition: and(always(), ne(variables['Build.Reason'], 'PullRequest'))
displayName: Publish wheel file to VSTS artifacts
inputs:
pathToPublish: $(Build.SourcesDirectory)/target
diff --git a/build/libs_linux.txt b/build/libs_linux.txt
index c2c7d848..b7298fef 100644
--- a/build/libs_linux.txt
+++ b/build/libs_linux.txt
@@ -1,3 +1,4 @@
+Google.Protobuf.dll
Newtonsoft.Json.dll
libCpuMathNative.so
libFastTreeNative.so
diff --git a/build/libs_mac.txt b/build/libs_mac.txt
index 1ebc1724..1c4dc2e4 100644
--- a/build/libs_mac.txt
+++ b/build/libs_mac.txt
@@ -1,3 +1,4 @@
+Google.Protobuf.dll
Newtonsoft.Json.dll
libCpuMathNative.dylib
libFastTreeNative.dylib
diff --git a/docs/developers/linux-build.md b/docs/developers/linux-build.md
index 6ed681e8..fa59738e 100644
--- a/docs/developers/linux-build.md
+++ b/docs/developers/linux-build.md
@@ -12,9 +12,9 @@ Building NimbusML from source on Linux
## Build
Run `./build.sh`
-This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `./build.sh --configuration RlsLinPy3.7` for examle.
+This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `./build.sh --configuration RlsLinPy3.7` for example.
For additional options including running tests and building components independently, see `./build.sh -h`.
### Known Issues
-The LightGBM estimator fails on Linux when building from source. The official NimbusML Linux wheel package on Pypi.org has a working version of LightGBM.
\ No newline at end of file
+The LightGBM estimator fails on Linux when building from source. The official NimbusML Linux wheel package on PyPI.org has a working version of LightGBM.
diff --git a/docs/developers/windows-build.md b/docs/developers/windows-build.md
index 8dd0e4b8..4d8c4da5 100644
--- a/docs/developers/windows-build.md
+++ b/docs/developers/windows-build.md
@@ -7,6 +7,6 @@ Building NimbusML from source on Windows
## Build
Run `build.cmd`
-This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `build.cmd --configuration RlsWinPy3.7` for examle.
+This downloads dependencies (.NET SDK, specific versions of Python and Boost), builds native code and managed code, and packages NimbusML into a pip-installable wheel. This produces debug binaries by default, and release versions can be specified by `build.cmd --configuration RlsWinPy3.7` for example.
For additional options including running tests and building components independently, see `build.cmd -?`.
diff --git a/docs/release-notes/release-1.5.0.md b/docs/release-notes/release-1.5.0.md
new file mode 100644
index 00000000..e5a2eded
--- /dev/null
+++ b/docs/release-notes/release-1.5.0.md
@@ -0,0 +1,101 @@
+# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.5.0
+
+## **New Features**
+
+- **Initial implementation of `csr_matrix` output support.**
+
+ [PR#250](https://github.com/microsoft/NimbusML/pull/250)
+ Add support for data output in `scipy.sparse.csr_matrix` format.
+
+ ```python
+ xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'})
+ xf.fit(train_df)
+ result = xf.transform(train_df, as_csr=True)
+ ```
+
+- **Permutation Feature Importance for model interpretability.**
+
+ [PR#279](https://github.com/microsoft/NimbusML/pull/279)
+ Adds `permutation_feature_importance()` method to `Pipeline` and
+ predictor estimators, enabling evaluation of model-wide feature
+ importances on any dataset with the same schema as the dataset used
+ to fit the `Pipeline`.
+
+ ```python
+ pipe = Pipeline([
+ LogisticRegressionBinaryClassifier(label='label', feature=['feature'])
+ ])
+ pipe.fit(data)
+ pipe.permutation_feature_importance(data)
+ ```
+
+- **Initial implementation of DateTime input and output column support.**
+
+ [PR#290](https://github.com/microsoft/NimbusML/pull/290)
+ Add initial support for input and output of Pandas DateTime columns.
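+
+ A minimal sketch of round-tripping a DateTime column through a transform (the transform choice and column names here are illustrative):
+
+ ```python
+ import pandas as pd
+ from nimbusml.preprocessing.schema import ColumnSelector
+
+ df = pd.DataFrame({'ts': pd.to_datetime(['2019-01-01', '2019-06-15'])})
+ xf = ColumnSelector(columns=['ts'])
+ # 'ts' should come back as a datetime column rather than being coerced.
+ print(xf.fit_transform(df).dtypes)
+ ```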
+
+- **Initial implementation of LpScaler.**
+
+ [PR#253](https://github.com/microsoft/NimbusML/pull/253)
+ Normalizes vectors (rows) individually by rescaling them to unit norm (L2, L1, or LInf).
+ Performs the following operation on a vector X: Y = (X - M) / D, where M is the mean and D
+ is either the L2 norm, L1 norm, or LInf norm.
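+
+ A minimal sketch (assuming `LpScaler` is importable from `nimbusml.preprocessing.normalization`; the column mapping is illustrative):
+
+ ```python
+ import pandas as pd
+ from nimbusml.preprocessing.normalization import LpScaler
+
+ df = pd.DataFrame({'f0': [3.0, 1.0], 'f1': [4.0, 1.0]})
+ # Rescale each row to unit norm (L2 by default).
+ xf = LpScaler(columns={'features': ['f0', 'f1']})
+ print(xf.fit_transform(df))
+ ```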
+
+- **Add support for variable-length vector output.**
+
+ [PR#267](https://github.com/microsoft/NimbusML/pull/267)
+ Support output of columns returned from ML.NET that contain variable-length vectors.
+
+- **Save `predictor_model` when pickling a `Pipeline`.**
+
+ [PR#295](https://github.com/microsoft/NimbusML/pull/295)
+
+- **Initial implementation of the WordTokenizer transform.**
+
+ [PR#296](https://github.com/microsoft/NimbusML/pull/296)
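+
+ A minimal sketch (assuming `WordTokenizer` is importable from `nimbusml.preprocessing.text`; the column mapping is illustrative):
+
+ ```python
+ import pandas as pd
+ from nimbusml.preprocessing.text import WordTokenizer
+
+ df = pd.DataFrame({'text': ['hello world', 'nimbusml wraps ML.NET']})
+ # Split each string into a variable-length vector of tokens.
+ xf = WordTokenizer(columns={'tokens': 'text'})
+ print(xf.fit_transform(df))
+ ```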
+
+- **Add support for summary output from tree-based predictors.**
+
+ [PR#298](https://github.com/microsoft/NimbusML/pull/298)
+
+## **Bug Fixes**
+
+- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided.**
+
+ [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+ Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided.
+
+- **Fix issue when using `predict_proba` or `decision_function` with combined models.**
+
+ [PR#272](https://github.com/microsoft/NimbusML/pull/272)
+
+- **Fix `Pipeline._extract_classes_from_headers` not checking for valid steps.**
+
+ [PR#292](https://github.com/microsoft/NimbusML/pull/292)
+
+- **Fix `BinaryDataStream` not being accepted as input for a transformer.**
+
+ [PR#307](https://github.com/microsoft/NimbusML/pull/307)
+
+- **Fix casing for the installPythonPackages build.sh argument.**
+
+ [PR#256](https://github.com/microsoft/NimbusML/pull/256)
+
+## **Breaking Changes**
+
+- **Removed `y` parameter from `Pipeline.transform()`**
+
+ [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+ Removed `y` parameter from `Pipeline.transform()` as it is not needed nor used for transforming data with a fitted `Pipeline`.
+
+## **Enhancements**
+
+None.
+
+## **Documentation and Samples**
+
+None.
+
+## **Remarks**
+
+None.
diff --git a/docs/release-notes/release-1.6.0.md b/docs/release-notes/release-1.6.0.md
new file mode 100644
index 00000000..fa5ef3d8
--- /dev/null
+++ b/docs/release-notes/release-1.6.0.md
@@ -0,0 +1,42 @@
+# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.6.0
+
+## **New Features**
+
+- **Initial implementation of NGramExtractor.**
+
+ [PR#320](https://github.com/microsoft/NimbusML/pull/320)
+ Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n)
+ in a given vector of keys. It does so by building a dictionary of n-grams and using
+ the id in the dictionary as the index in the bag.
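+
+ A minimal sketch of the bag-of-n-grams flow (assuming `NGramExtractor` is importable from `nimbusml.feature_extraction.text` and consumes key-valued tokens; module paths, parameters, and column names are illustrative):
+
+ ```python
+ import pandas as pd
+ from nimbusml import Pipeline
+ from nimbusml.preprocessing import ToKey
+ from nimbusml.preprocessing.text import WordTokenizer
+ from nimbusml.feature_extraction.text import NGramExtractor
+
+ df = pd.DataFrame({'text': ['a b a c', 'b c b']})
+ pipe = Pipeline([
+     WordTokenizer(columns={'tokens': 'text'}),   # split into tokens
+     ToKey(columns={'keys': 'tokens'}),           # map tokens to key values
+     NGramExtractor(columns={'ngrams': 'keys'}),  # count n-grams over the keys
+ ])
+ print(pipe.fit_transform(df))
+ ```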
+
+- **Update Manifest Generator.**
+
+ [PR#329](https://github.com/microsoft/NimbusML/pull/329)
+ Update the Manifest Generator project to work with the latest changes and incorporate
+ it into the build process.
+
+## **Bug Fixes**
+
+None.
+
+## **Enhancements**
+
+- **Update To ML.Net Version 1.4.0.**
+
+ [PR#353](https://github.com/microsoft/NimbusML/pull/353)
+
+- **Update To Latest Version Of DataPrep.**
+
+ [PR#379](https://github.com/microsoft/NimbusML/pull/379)
+
+- **Update Tests To Execute In Parallel.**
+
+ [PR#331](https://github.com/microsoft/NimbusML/pull/331)
+
+## **Documentation and Samples**
+
+None.
+
+## **Remarks**
+
+None.
diff --git a/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg b/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg
deleted file mode 100644
index 0a8b2fbd..00000000
Binary files a/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 2ceed83a..00000000
Binary files a/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 845b027f..00000000
Binary files a/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index a8debf72..00000000
Binary files a/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index f858c678..00000000
Binary files a/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 3cf6ed34..00000000
Binary files a/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 008df73c..00000000
Binary files a/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index bdcd6852..00000000
Binary files a/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 5729bfa7..00000000
Binary files a/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index beefe429..00000000
Binary files a/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index f728196c..00000000
Binary files a/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 73ffedf4..00000000
Binary files a/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 9cbdef31..00000000
Binary files a/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 069b69d9..00000000
Binary files a/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 8e27e3cc..00000000
Binary files a/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index f72c9382..00000000
Binary files a/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 554d2417..00000000
Binary files a/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index fc844210..00000000
Binary files a/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 820b48b3..00000000
Binary files a/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 4174ee8e..00000000
Binary files a/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index cb04dfd5..00000000
Binary files a/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 5be74193..00000000
Binary files a/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 7c5afeb9..00000000
Binary files a/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 11d473a0..00000000
Binary files a/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 381c705c..00000000
Binary files a/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index cbd0cf9d..00000000
Binary files a/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 7e448a72..00000000
Binary files a/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index c24c142e..00000000
Binary files a/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 42d18904..00000000
Binary files a/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 045429c8..00000000
Binary files a/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 4a1216b1..00000000
Binary files a/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 0d97af5c..00000000
Binary files a/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index e8e99abc..00000000
Binary files a/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 8f51320e..00000000
Binary files a/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 765ce5f9..00000000
Binary files a/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index dffcf5c4..00000000
Binary files a/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 0c802cb0..00000000
Binary files a/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 88add318..00000000
Binary files a/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 6348fe79..00000000
Binary files a/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 6637e4ff..00000000
Binary files a/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 2b4619e7..00000000
Binary files a/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg
deleted file mode 100644
index 2e943616..00000000
Binary files a/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 88925eb0..00000000
Binary files a/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index 036a2ca2..00000000
Binary files a/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg
deleted file mode 100644
index fcb211d3..00000000
Binary files a/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg and /dev/null differ
diff --git a/nimbusml.sln b/nimbusml.sln
index 546014a9..c87f94b8 100644
--- a/nimbusml.sln
+++ b/nimbusml.sln
@@ -20,6 +20,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
version.txt = version.txt
EndProjectSection
EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManifestGenerator", "src\ManifestGenerator\ManifestGenerator.csproj", "{D3AED287-722F-4243-966E-77AD0652B38E}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
DbgLinPy2.7|x64 = DbgLinPy2.7|x64
@@ -65,36 +67,36 @@ Global
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.5|x64.ActiveCfg = DbgLinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.5|x64.Build.0 = DbgLinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.6|x64.ActiveCfg = DbgLinPy3.6|x64
- {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.6|x64.Build.0 = DbgLinPy3.6|x64
+ {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgLinPy3.7|x64.Build.0 = DbgLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy2.7|x64.Build.0 = DbgWinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64
- {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64
+ {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy2.7|x64.ActiveCfg = RlsLinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy2.7|x64.Build.0 = RlsLinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.5|x64.ActiveCfg = RlsLinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.5|x64.Build.0 = RlsLinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.6|x64.ActiveCfg = RlsLinPy3.6|x64
- {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.6|x64.Build.0 = RlsLinPy3.6|x64
+ {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsLinPy3.7|x64.Build.0 = RlsLinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.6|x64.ActiveCfg = RlsMacPy3.6|x64
- {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.6|x64.Build.0 = RlsMacPy3.6|x64
+ {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsMacPy3.7|x64.Build.0 = RlsMacPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy2.7|x64.Build.0 = RlsWinPy2.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64
- {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64
+ {EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64
{EC58F2CF-A1D5-4E28-97F9-69B1E46F6F63}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgLinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgLinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64
@@ -105,8 +107,8 @@ Global
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64
- {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64
+ {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsLinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsLinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64
@@ -119,9 +121,45 @@ Global
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64
- {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64
+ {3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64
{3DA0AF32-A05B-4ECF-8010-83B14612FBB3}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy2.7|x64.ActiveCfg = DbgLinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy2.7|x64.Build.0 = DbgLinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.5|x64.ActiveCfg = DbgLinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.5|x64.Build.0 = DbgLinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.6|x64.ActiveCfg = DbgLinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.6|x64.Build.0 = DbgLinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.7|x64.ActiveCfg = DbgLinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgLinPy3.7|x64.Build.0 = DbgLinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy2.7|x64.ActiveCfg = DbgWinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy2.7|x64.Build.0 = DbgWinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.5|x64.ActiveCfg = DbgWinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.5|x64.Build.0 = DbgWinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.6|x64.ActiveCfg = DbgWinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.6|x64.Build.0 = DbgWinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.7|x64.ActiveCfg = DbgWinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.DbgWinPy3.7|x64.Build.0 = DbgWinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy2.7|x64.ActiveCfg = RlsLinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy2.7|x64.Build.0 = RlsLinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.5|x64.ActiveCfg = RlsLinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.5|x64.Build.0 = RlsLinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.6|x64.ActiveCfg = RlsLinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.6|x64.Build.0 = RlsLinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.7|x64.ActiveCfg = RlsLinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsLinPy3.7|x64.Build.0 = RlsLinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.6|x64.ActiveCfg = RlsMacPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.6|x64.Build.0 = RlsMacPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.7|x64.ActiveCfg = RlsMacPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsMacPy3.7|x64.Build.0 = RlsMacPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy2.7|x64.ActiveCfg = RlsWinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy2.7|x64.Build.0 = RlsWinPy2.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.5|x64.ActiveCfg = RlsWinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.5|x64.Build.0 = RlsWinPy3.5|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.6|x64.ActiveCfg = RlsWinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.6|x64.Build.0 = RlsWinPy3.6|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.7|x64.ActiveCfg = RlsWinPy3.7|x64
+ {D3AED287-722F-4243-966E-77AD0652B38E}.RlsWinPy3.7|x64.Build.0 = RlsWinPy3.7|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/nuget.config b/nuget.config
index 75ab3744..c0efdcaa 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,7 +5,7 @@
-
-
+
+
diff --git a/release-next.md b/release-next.md
index 031f060f..c6d1ec43 100644
--- a/release-next.md
+++ b/release-next.md
@@ -2,91 +2,11 @@
## **New Features**
-- **Initial implementation of `csr_matrix` output support.**
-
- [PR#250](https://github.com/microsoft/NimbusML/pull/250)
- Add support for data output in `scipy.sparse.csr_matrix` format.
-
- ```python
- xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'})
- xf.fit(train_df)
- result = xf.transform(train_df, as_csr=True)
- ```
-
-- **Permutation Feature Importance for model interpretibility.**
-
- [PR#279](https://github.com/microsoft/NimbusML/pull/279)
- Adds `permutation_feature_importance()` method to `Pipeline` and
- predictor estimators, enabling evaluation of model-wide feature
- importances on any dataset with same schema as the dataset used
- to fit the `Pipeline`.
-
- ```python
- pipe = Pipeline([
- LogisticRegressionBinaryClassifier(label='label', feature=['feature'])
- ])
- pipe.fit(data)
- pipe.permutation_feature_importance(data)
- ```
-
-- **Initial implementation of DateTime input and output column support.**
-
- [PR#290](https://github.com/microsoft/NimbusML/pull/290)
- Add initial support for input and output of Pandas DateTime columns.
-
-- **Initial implementation of LpScaler.**
-
- [PR#253](https://github.com/microsoft/NimbusML/pull/253)
- Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf).
- Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D
- is either L2 norm, L1 norm or LInf norm.
-
-- **Add support for variable length vector output.**
-
- [PR#267](https://github.com/microsoft/NimbusML/pull/267)
- Support output of columns returned from ML.Net which contain variable length vectors.
-
-- **Save `predictor_model` when pickling a `Pipeline`.**
-
- [PR#295](https://github.com/microsoft/NimbusML/pull/295)
-
-- **Initial implementation of the WordTokenizer transform.**
-
- [PR#296](https://github.com/microsoft/NimbusML/pull/296)
-
-- **Add support for summary output from tree based predictors.**
-
- [PR#298](https://github.com/microsoft/NimbusML/pull/298)
+None.
## **Bug Fixes**
-- **Fixed `Pipeline.transform()` in transform only `Pipeline` fails if y column is provided **
-
- [PR#294](https://github.com/microsoft/NimbusML/pull/294)
- Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided
-
-- **Fix issue when using `predict_proba` or `decision_function` with combined models.**
-
- [PR#272](https://github.com/microsoft/NimbusML/pull/272)
-
-- **Fix `Pipeline._extract_classes_from_headers` was not checking for valid steps.**
-
- [PR#292](https://github.com/microsoft/NimbusML/pull/292)
-
-- **Fix BinaryDataStream was not valid as input for transformer.**
-
- [PR#307](https://github.com/microsoft/NimbusML/pull/307)
-
-- **Fix casing for the installPythonPackages build.sh argument.**
-
- [PR#256](https://github.com/microsoft/NimbusML/pull/256)
-
-## **Breaking Changes**
-
-- **Removed `y` parameter from `Pipeline.transform()`**
-
- [PR#294](https://github.com/microsoft/NimbusML/pull/294)
- Removed `y` parameter from `Pipeline.transform()` as it is not needed nor used for transforming data with a fitted `Pipeline`.
+None.
## **Enhancements**
diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs
index 00947124..30450540 100644
--- a/src/DotNetBridge/Bridge.cs
+++ b/src/DotNetBridge/Bridge.cs
@@ -8,9 +8,9 @@
using System.Text;
using System.Threading;
using Microsoft.ML;
-using Microsoft.ML.Featurizers;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
+using Microsoft.ML.Featurizers;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
using Microsoft.ML.Trainers.Ensemble;
@@ -302,8 +302,8 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd
//env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
//env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(CategoryImputerTransformer).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly);
+ env.ComponentCatalog.RegisterAssembly(typeof(DateTimeTransformer).Assembly);
using (var ch = host.Start("Executing"))
{
diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj
index 9985bb62..67ba3209 100644
--- a/src/DotNetBridge/DotNetBridge.csproj
+++ b/src/DotNetBridge/DotNetBridge.csproj
@@ -32,21 +32,21 @@
allruntime; build; native; contentfiles; analyzers
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs
index 9be84e67..535d9d75 100644
--- a/src/DotNetBridge/Entrypoints.cs
+++ b/src/DotNetBridge/Entrypoints.cs
@@ -33,7 +33,7 @@ public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env,
for (int i = 0; i < input.Data.Schema.Count; i++)
colNames.Add(input.Data.Schema[i].Name);
- // Iterate throuh input options, find matching source columns, create new input options
+ // Iterate through input options, find matching source columns, create new input options
var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data };
var columns = new List<ColumnConcatenatingTransformer.ColumnOptions>(input.Columns.Length);
foreach (var col in input.Columns)
diff --git a/src/DotNetBridge/ManifestUtils.cs b/src/DotNetBridge/ManifestUtils.cs
new file mode 100644
index 00000000..c01b8480
--- /dev/null
+++ b/src/DotNetBridge/ManifestUtils.cs
@@ -0,0 +1,112 @@
+//------------------------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//------------------------------------------------------------------------------
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Reflection;
+using System.Text.RegularExpressions;
+using Microsoft.ML.Data;
+using Microsoft.ML.EntryPoints;
+using Microsoft.ML.Featurizers;
+using Microsoft.ML.Model.OnnxConverter;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.Ensemble;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Trainers.LightGbm;
+using Microsoft.ML.Transforms;
+using Microsoft.ML.Transforms.TimeSeries;
+using Newtonsoft.Json;
+using Newtonsoft.Json.Linq;
+
+
+namespace Microsoft.ML.DotNetBridge
+{
+ public static class ManifestUtils
+ {
+ private static readonly Type[] _types = new Type[]
+ {
+ typeof(TextLoader),
+ typeof(LinearModelParameters),
+ typeof(OneHotEncodingTransformer),
+ typeof(FastTreeBinaryModelParameters),
+ typeof(EnsembleModelParameters),
+ typeof(KMeansModelParameters),
+ typeof(PcaModelParameters),
+ typeof(CVSplit),
+ typeof(LightGbmBinaryModelParameters),
+ typeof(TensorFlowTransformer),
+ typeof(ImageLoadingTransformer),
+ typeof(SymbolicSgdLogisticRegressionBinaryTrainer),
+ typeof(OnnxContext),
+ typeof(SsaForecastingTransformer),
+ typeof(VariableColumnTransform),
+ typeof(DateTimeTransformer)
+ };
+
+ private static (IEnumerable<string> epListContents, JObject manifest) BuildManifests()
+ {
+ ConsoleEnvironment env = new ConsoleEnvironment();
+
+ foreach (Type type in _types)
+ {
+ env.ComponentCatalog.RegisterAssembly(type.Assembly);
+ }
+
+ var catalog = env.ComponentCatalog;
+
+ var regex = new Regex(@"\r\n?|\n", RegexOptions.Compiled);
+ var epListContents = catalog.AllEntryPoints()
+ .Select(x => string.Join("\t",
+ x.Name,
+ regex.Replace(x.Description, ""),
+ x.Method.DeclaringType,
+ x.Method.Name,
+ x.InputType,
+ x.OutputType)
+ .Replace(Environment.NewLine, "", StringComparison.Ordinal))
+ .OrderBy(x => x);
+
+ var manifest = JsonManifestUtils.BuildAllManifests(env, catalog);
+
+ // Strip newline characters from the entry point descriptions.
+ if (manifest[FieldNames.TopEntryPoints] != null && manifest[FieldNames.TopEntryPoints] is JArray)
+ {
+ foreach (JToken entry in manifest[FieldNames.TopEntryPoints].Children())
+ if (entry[FieldNames.Desc] != null)
+ entry[FieldNames.Desc] = regex.Replace(entry[FieldNames.Desc].ToString(), "");
+ }
+
+ return (epListContents, manifest);
+ }
+
+ public static void ShowAssemblyInfo()
+ {
+ foreach (Type type in _types)
+ {
+ Assembly assembly = type.Assembly;
+ Console.WriteLine(assembly.Location);
+ }
+ }
+
+ public static void GenerateManifest(string filePath)
+ {
+ var (epListContents, jObj) = BuildManifests();
+
+ if (!string.IsNullOrWhiteSpace(filePath))
+ File.Delete(filePath);
+
+ using (var file = File.OpenWrite(filePath))
+ using (var writer = new StreamWriter(file))
+ using (var jw = new JsonTextWriter(writer))
+ {
+ jw.Formatting = Formatting.Indented;
+ jObj.WriteTo(jw);
+ }
+ }
+ }
+}
diff --git a/src/DotNetBridge/MessageValidator.cs b/src/DotNetBridge/MessageValidator.cs
index 4243a45d..1ebcae67 100644
--- a/src/DotNetBridge/MessageValidator.cs
+++ b/src/DotNetBridge/MessageValidator.cs
@@ -195,7 +195,7 @@ public sealed class MessageValidator
{
return "Failed to initialize CUDA runtime. Possible reasons:" + "\n" +
@"1. The machine does not have CUDA-capable card. Supported devices have compute capability 2.0 and higher." + "\n" +
- @"2. Outdated graphics drivers. Please install the latest drivers from http://www.nvidia.com/Drivers ." + "\n" +
+ @"2. Outdated graphics drivers. Please install the latest drivers from https://www.nvidia.com/Download/index.aspx?lang=en-us ." + "\n" +
@"3. CUDA runtime DLLs are missing, please see the GPU acceleration help for the installation instructions.";
}
)
diff --git a/src/ManifestGenerator/ManifestGenerator.cs b/src/ManifestGenerator/ManifestGenerator.cs
index 985318f6..b872775d 100644
--- a/src/ManifestGenerator/ManifestGenerator.cs
+++ b/src/ManifestGenerator/ManifestGenerator.cs
@@ -3,56 +3,79 @@
// Licensed under the MIT License.
//------------------------------------------------------------------------------
+using System;
using System.IO;
-using Microsoft.ML.Runtime;
-using Microsoft.ML.Runtime.Data;
-using Microsoft.ML.Runtime.EntryPoints.JsonUtils;
-using Microsoft.ML.Runtime.ImageAnalytics;
-using Microsoft.ML.Runtime.Learners;
-using Microsoft.ML.Runtime.LightGBM;
-using Microsoft.ML.Runtime.Model.Onnx;
-using Microsoft.ML.Runtime.PipelineInference;
-using Microsoft.ML.Trainers.FastTree;
-using Microsoft.ML.Trainers.KMeans;
-using Microsoft.ML.Trainers.PCA;
-using Microsoft.ML.Trainers.SymSgd;
-using Microsoft.ML.Transforms;
-using Microsoft.ML.Transforms.Categorical;
-using Newtonsoft.Json;
-
-namespace Microsoft.MachineLearning.ManifestGenerator
+using System.Linq;
+using Microsoft.ML.DotNetBridge;
+
+
+namespace Microsoft.ML.ManifestGenerator
{
public static class ManifestGenerator
{
- public static void Main()
+ private const int ERROR_SUCCESS = 0;
+ private const int ERROR_BAD_ARGUMENTS = 1;
+ private const int ERROR_MANIFEST_INVALID = 2;
+
+ public static void ShowUsage()
{
- using (var env = new ConsoleEnvironment())
+ string usage =
+ "Usage:\n" +
+ " create MANIFEST_PATH Creates a new manifest given the\n" +
+ " current assemblies and stores it\n" +
+ " in the file MANIFEST_PATH.\n" +
+ " verify MANIFEST_PATH Checks if the manifest specified by\n" +
+ " MANIFEST_PATH is valid given the\n" +
+ " the current assemblies.\n" +
+ "\n";
+
+ Console.WriteLine(usage);
+ }
+
+ public static int Main(string[] args)
+ {
+ int exitCode = ERROR_BAD_ARGUMENTS;
+
+ if (args.Length == 2)
{
- env.ComponentCatalog.RegisterAssembly(typeof(TextLoader).Assembly); // ML.Data
- env.ComponentCatalog.RegisterAssembly(typeof(LinearPredictor).Assembly); // ML.StandardLearners
- env.ComponentCatalog.RegisterAssembly(typeof(CategoricalTransform).Assembly); // ML.Transforms
- env.ComponentCatalog.RegisterAssembly(typeof(FastTreeBinaryPredictor).Assembly); // ML.FastTree
- env.ComponentCatalog.RegisterAssembly(typeof(KMeansPredictor).Assembly); // ML.KMeansClustering
- env.ComponentCatalog.RegisterAssembly(typeof(PcaPredictor).Assembly); // ML.PCA
- env.ComponentCatalog.RegisterAssembly(typeof(Experiment).Assembly); // ML.Legacy
- env.ComponentCatalog.RegisterAssembly(typeof(LightGbmBinaryPredictor).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransform).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(ImageLoaderTransform).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(SymSgdClassificationTrainer).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly);
- env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly);
- var catalog = env.ComponentCatalog;
- var jObj = JsonManifestUtils.BuildAllManifests(env, catalog);
-
- var jPath = "manifest.json";
- using (var file = File.OpenWrite(jPath))
- using (var writer = new StreamWriter(file))
- using (var jw = new JsonTextWriter(writer))
+ if (args[0].ToLower() == "create")
{
- jw.Formatting = Formatting.Indented;
- jObj.WriteTo(jw);
+ ManifestUtils.ShowAssemblyInfo();
+ ManifestUtils.GenerateManifest(args[1]);
+
+ exitCode = ERROR_SUCCESS;
}
+ else if (args[0].ToLower() == "verify")
+ {
+ string tmpFilePath = Path.GetTempFileName();
+ ManifestUtils.GenerateManifest(tmpFilePath);
+
+ exitCode = FilesMatch(args[1], tmpFilePath) ?
+     ERROR_SUCCESS :
+     ERROR_MANIFEST_INVALID;
+
+ File.Delete(tmpFilePath);
+ }
+ }
+
+ if (exitCode == ERROR_BAD_ARGUMENTS)
+ {
+ Console.WriteLine("ManifestGenerator: Error - Invalid Arguments.");
+ ShowUsage();
}
+
+ return exitCode;
+ }
+
+ private static bool FilesMatch(string path1, string path2)
+ {
+ long fileLength1 = new FileInfo(path1).Length;
+ long fileLength2 = new FileInfo(path2).Length;
+ if (fileLength1 != fileLength2) return false;
+
+ // TODO: read in only parts of the file at a time
+ bool bytesMatch = File.ReadAllBytes(path1).SequenceEqual(File.ReadAllBytes(path2));
+ return bytesMatch;
}
}
}
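For reference, the exit-code contract above (0 = success, 1 = bad arguments, 2 = manifest invalid) can be checked from a script. This is a minimal sketch, assuming the tool has been published as ManifestGenerator.dll (the path and file names are illustrative):

    # Minimal sketch: invoke the generator and map its exit code.
    import subprocess

    EXIT_CODES = {0: 'success', 1: 'bad arguments', 2: 'manifest invalid'}
    result = subprocess.run(
        ['dotnet', 'ManifestGenerator.dll', 'verify', 'manifest.json'])
    print(EXIT_CODES.get(result.returncode, 'unknown'))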
diff --git a/src/ManifestGenerator/ManifestGenerator.csproj b/src/ManifestGenerator/ManifestGenerator.csproj
index 4cd94610..13e69006 100644
--- a/src/ManifestGenerator/ManifestGenerator.csproj
+++ b/src/ManifestGenerator/ManifestGenerator.csproj
@@ -1,18 +1,24 @@
- {D3AED287-722F-4243-966E-77AD0652B38E}
- Exe
- Properties
+ netcoreapp2.1truex64
+ CORECLRManifestGeneratorManifestGeneratorfalse
- $(OutputBase)
- Debug;Release
- Microsoft.MachineLearning.ManifestGenerator.ManifestGenerator
+ ..\$(Platform)\$(Configuration)\
+ DbgWinPy3.7;DbgWinPy3.6;DbgWinPy3.5;DbgWinPy2.7;RlsWinPy3.7;RlsWinPy3.6;RlsWinPy3.5;RlsWinPy2.7;DbgLinPy3.7;DbgLinPy3.6;DbgLinPy3.5;DbgLinPy2.7;RlsLinPy3.7;RlsLinPy3.6;RlsLinPy3.5;RlsLinPy2.7;RlsMacPy3.7;RlsMacPy3.6
+ Microsoft.ML.ManifestGenerator.ManifestGenerator
+ 0.1.0
+ Microsoft Corporation
+ (c) Microsoft Corporation. All rights reserved.
+ https://github.com/Microsoft/NimbusML
+ https://github.com/Microsoft/NimbusML
+ latest
- netcoreapp2.0
+ {D3AED287-722F-4243-966E-77AD0652B38E}
+ Exe
@@ -24,13 +30,7 @@
-
-
-
-
-
-
-
+
diff --git a/src/ManifestGenerator/ManifestGenerator.sln b/src/ManifestGenerator/ManifestGenerator.sln
deleted file mode 100644
index 56d26d1d..00000000
--- a/src/ManifestGenerator/ManifestGenerator.sln
+++ /dev/null
@@ -1,25 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 15
-VisualStudioVersion = 15.0.27428.2037
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ManifestGenerator", "ManifestGenerator.csproj", "{D3AED287-722F-4243-966E-77AD0652B38E}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|x64 = Debug|x64
- Release|x64 = Release|x64
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {D3AED287-722F-4243-966E-77AD0652B38E}.Debug|x64.ActiveCfg = Debug|x64
- {D3AED287-722F-4243-966E-77AD0652B38E}.Debug|x64.Build.0 = Debug|x64
- {D3AED287-722F-4243-966E-77AD0652B38E}.Release|x64.ActiveCfg = Release|x64
- {D3AED287-722F-4243-966E-77AD0652B38E}.Release|x64.Build.0 = Release|x64
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
- GlobalSection(ExtensibilityGlobals) = postSolution
- SolutionGuid = {620035F0-EA24-426B-BA6F-FF34BC8E14FA}
- EndGlobalSection
-EndGlobal
diff --git a/src/ManifestGenerator/app.config b/src/ManifestGenerator/app.config
index 5618aa07..7ded20c2 100644
--- a/src/ManifestGenerator/app.config
+++ b/src/ManifestGenerator/app.config
@@ -1,13 +1,5 @@
-
-
-
-
-
-
-
-
diff --git a/src/NativeBridge/DataViewInterop.h b/src/NativeBridge/DataViewInterop.h
index c764b285..f9e87763 100644
--- a/src/NativeBridge/DataViewInterop.h
+++ b/src/NativeBridge/DataViewInterop.h
@@ -16,7 +16,7 @@ typedef MANAGED_CALLBACK_PTR(bool, GETLABELS)(DataSourceBlock *source, int col,
// REVIEW: boost_python is not updated at the same speed as swig or pybind11.
// Both have a larger audience now, see about pybind11 https://github.com/davisking/dlib/issues/293
-// It handles csr_matrix: http://pybind11-rtdtest.readthedocs.io/en/stable/advanced.html#transparent-conversion-of-dense-and-sparse-eigen-data-types.
+// It handles csr_matrix: https://pybind11-rtdtest.readthedocs.io/en/stable/advanced.html#transparent-conversion-of-dense-and-sparse-eigen-data-types.
using namespace boost::python;
// The data source wrapper used for managed interop. Some of the fields of this are visible to managed code.
@@ -240,6 +240,7 @@ class DataSourceBlock
if (bp::extract<const char*>(str(s).encode("utf_8")).check())
{
+ size = -1;
missing = -1;
pch = bp::extract<const char*>(str(s).encode("utf_8"));
#if _MSC_VER
diff --git a/src/NativeBridge/NativeBridge.vcxproj b/src/NativeBridge/NativeBridge.vcxproj
index f9cf674c..82367ce5 100644
--- a/src/NativeBridge/NativeBridge.vcxproj
+++ b/src/NativeBridge/NativeBridge.vcxproj
@@ -150,7 +150,7 @@
CORECLR;_DEBUG;_WINDOWS;_USRDLL;PYBRIDGE_EXPORTS;BOOST_USE_STATIC_LIBS;BOOST_PYTHON_STATIC_LIB;BOOST_ALL_NO_LIB;BOOST_NUMPY_STATIC_LIB;_HAS_ITERATOR_DEBUGGING;%(PreprocessorDefinitions)$(BoostRoot)\Include;$(PythonRoot)\includetrue
- MultiThreadedDebug
+ MultiThreadedDebugDLLtrue
diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj
index 626822c9..6a0b7ab7 100644
--- a/src/Platforms/build.csproj
+++ b/src/Platforms/build.csproj
@@ -11,21 +11,21 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/python/README.md b/src/python/README.md
index 74254980..4385ade0 100644
--- a/src/python/README.md
+++ b/src/python/README.md
@@ -1,27 +1,35 @@
-NimbusML
+# NimbusML
-`nimbusml` provides battle-tested state-of-the-art ML algorithms,
-transforms and components, aiming to make them useful for all
-developers, data scientists, and information workers and helpful in all
-products, services and devices. The components are authored by the team
-members, as well as numerous contributors from MSR, CISL, Bing and other
-teams at Microsoft.
+`nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning).
+
+`nimbusml` aims to enable data science teams that are more familiar with Python
+to take advantage of ML.NET's functionality and performance. It provides
+battle-tested, state-of-the-art ML algorithms, transforms, and components. The
+components are authored by the team members, as well as numerous contributors
+from MSR, CISL, Bing, and other teams at Microsoft.
`nimbusml` is interoperable with `scikit-learn` estimators and transforms,
-while adding a suite of highly optimized algorithms written in C++ and
-C\# for speed and performance. `nimbusml` trainers and transforms support
-the following data structures for the `fit()` and `transform()` methods:
+while adding a suite of fast, highly optimized, and scalable algorithms written
+in C++ and C\#. `nimbusml` trainers and transforms support the following data
+structures for the `fit()` and `transform()` methods:
- `numpy.ndarray`
- `scipy.sparse_cst`
- `pandas.DataFrame`.
-In addition, `nimbusml` also supports streaming from files without loading
-the dataset into memory, which allows training on data significantly
-exceeding memory using `FileDataStream`.
+In addition, `nimbusml` also supports streaming from files without loading the
+dataset into memory with `FileDataStream`, which allows training on data
+significantly exceeding memory.
-With `FileDataStream` `nimbusml` is able to handle up to **billion** features
- and **billions** of training examples for select algorithms.
+With `FileDataStream`, `nimbusml` is able to handle up to a **billion**
+features and **billions** of training examples for select algorithms.
For more details, please refer to the documentation:
-.
\ No newline at end of file
+.
+
+## Third party notices
+
+`nimbusml` contains ML.NET binaries and the .NET Core CLR runtime, as well as
+their dependencies. Both ML.NET and .NET Core CLR are made available under the
+MIT license. Please refer to the [third party notices](https://github.com/microsoft/NimbusML/blob/master/THIRD-PARTY-NOTICES.txt)
+for full licensing information for ML.NET and .NET Core CLR.
\ No newline at end of file
diff --git a/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt b/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt
index 45b12209..31314605 100644
--- a/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt
+++ b/src/python/docs/docstrings/AveragedPerceptronBinaryClassifier.txt
@@ -45,10 +45,10 @@
`_
`Large Margin Classification Using the Perceptron Algorithm
- `_
+ `_
`Discriminative Training Methods for Hidden Markov Models
- `_
+ `_
:param loss: The default is :py:class:`'hinge' `. Other
diff --git a/src/python/docs/docstrings/Dart.txt b/src/python/docs/docstrings/Dart.txt
index faa504e0..d037b6f7 100644
--- a/src/python/docs/docstrings/Dart.txt
+++ b/src/python/docs/docstrings/Dart.txt
@@ -7,9 +7,9 @@
`_ is an
ensemble method of boosted regression trees. The Dropouts meet
Multiple Additive Regression
- Trees (DART) employs dropouts in MART and overcomes the issues of over-
+ Trees (DART) employs dropouts in MART and overcomes the issues of over-
specialization of MART,
- achiving better performance in many tasks.
+ achieving better performance in many tasks.
**Reference**
diff --git a/src/python/docs/docstrings/DssmFeaturizer.txt b/src/python/docs/docstrings/DssmFeaturizer.txt
deleted file mode 100644
index d71d2540..00000000
--- a/src/python/docs/docstrings/DssmFeaturizer.txt
+++ /dev/null
@@ -1,32 +0,0 @@
- """
-
- The input to this transform is text. It applies a pretrained DSSM
- featurizer and outputs semantic embeddings for
- the input vectors and a cosine similarity computed between the query
- and document columns.
-
- .. remarks::
- DSSM is a neural network algorithm that produces feature embeddings
- for key-value string pairs. It is trained
- using a dataset consisting of positive key-value pairs, from which
- the original rows are used as correct
- examples, and the strings are recombined to produce adversarial,
- incorrect training examples. Some example of
- key-value pairs include search query and clicked document title text,
- search query and clicked ad content text,
- Search using Clickthrough Data `_ , an MSR publication.
-
-
- .. seealso::
- :py:class:`NGramFeaturizer `,
- :py:class:`Sentiment `,
- :py:class:`SsweEmbedding `,
- :py:class:`WordEmbedding `.
-
- .. index:: transform, featurizer, text
-
- Example:
- .. literalinclude:: /../nimbusml/examples/DssmFeaturizer.py
- :language: python
- """
\ No newline at end of file
diff --git a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt
index 787972a2..c8e86ac9 100644
--- a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt
+++ b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt
@@ -22,7 +22,7 @@
`Field Aware Factorization Machines
`_,
`Field-aware Factorization Machines for CTR Prediction
- `_,
+ `_,
`Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization
`_
diff --git a/src/python/docs/docstrings/FastForestBinaryClassifier.txt b/src/python/docs/docstrings/FastForestBinaryClassifier.txt
index 6ebc1938..3e9a6688 100644
--- a/src/python/docs/docstrings/FastForestBinaryClassifier.txt
+++ b/src/python/docs/docstrings/FastForestBinaryClassifier.txt
@@ -33,7 +33,7 @@
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/docs/docstrings/FastForestRegressor.txt b/src/python/docs/docstrings/FastForestRegressor.txt
index 0d01ad8c..35a6ad5e 100644
--- a/src/python/docs/docstrings/FastForestRegressor.txt
+++ b/src/python/docs/docstrings/FastForestRegressor.txt
@@ -43,7 +43,7 @@
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt
index db2c74db..a16893e8 100644
--- a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt
+++ b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt
@@ -1,7 +1,7 @@
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear binary classification.
.. remarks::
``FastLinearBinaryClassifier`` is a trainer based on the Stochastic
@@ -58,8 +58,7 @@
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param loss: The default is :py:class:`'log' `. Other
diff --git a/src/python/docs/docstrings/FastLinearClassifier.txt b/src/python/docs/docstrings/FastLinearClassifier.txt
index 2fcb2868..d9984dd5 100644
--- a/src/python/docs/docstrings/FastLinearClassifier.txt
+++ b/src/python/docs/docstrings/FastLinearClassifier.txt
@@ -1,6 +1,7 @@
"""
- Train an SDCA multi class model
+ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for
+ multi class classification.
.. remarks::
``FastLinearClassifier`` is a trainer based on the Stochastic Dual
@@ -56,8 +57,7 @@
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param feature: see `Columns `_.
diff --git a/src/python/docs/docstrings/FastLinearRegressor.txt b/src/python/docs/docstrings/FastLinearRegressor.txt
index 4dda71be..9e7c5d88 100644
--- a/src/python/docs/docstrings/FastLinearRegressor.txt
+++ b/src/python/docs/docstrings/FastLinearRegressor.txt
@@ -1,7 +1,7 @@
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear regression.
.. remarks::
``FastLinearRegressor`` is a trainer based on the Stochastic Dual
@@ -56,8 +56,7 @@
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param loss: The only supported loss is :py:class:`'squared'
diff --git a/src/python/docs/docstrings/FastTreesBinaryClassifier.txt b/src/python/docs/docstrings/FastTreesBinaryClassifier.txt
index 1789d738..15865149 100644
--- a/src/python/docs/docstrings/FastTreesBinaryClassifier.txt
+++ b/src/python/docs/docstrings/FastTreesBinaryClassifier.txt
@@ -57,7 +57,7 @@
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param optimizer: Default is ``sgd``.
diff --git a/src/python/docs/docstrings/FastTreesRegressor.txt b/src/python/docs/docstrings/FastTreesRegressor.txt
index cd1f76b8..91a3622d 100644
--- a/src/python/docs/docstrings/FastTreesRegressor.txt
+++ b/src/python/docs/docstrings/FastTreesRegressor.txt
@@ -62,7 +62,7 @@
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param optimizer: Default is ``sgd``.
diff --git a/src/python/docs/docstrings/FastTreesTweedieRegressor.txt b/src/python/docs/docstrings/FastTreesTweedieRegressor.txt
index 76cd6749..3c02e645 100644
--- a/src/python/docs/docstrings/FastTreesTweedieRegressor.txt
+++ b/src/python/docs/docstrings/FastTreesTweedieRegressor.txt
@@ -14,7 +14,7 @@
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param optimizer: Default is ``sgd``.
diff --git a/src/python/docs/docstrings/FromKey.txt b/src/python/docs/docstrings/FromKey.txt
index a61b7064..fd162550 100644
--- a/src/python/docs/docstrings/FromKey.txt
+++ b/src/python/docs/docstrings/FromKey.txt
@@ -1,7 +1,6 @@
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts the key types back to their original values.
.. remarks::
The ``FromKey`` transform converts a column of keys, generated using
diff --git a/src/python/docs/docstrings/GamBinaryClassifier.txt b/src/python/docs/docstrings/GamBinaryClassifier.txt
index 69484156..acd5f023 100644
--- a/src/python/docs/docstrings/GamBinaryClassifier.txt
+++ b/src/python/docs/docstrings/GamBinaryClassifier.txt
@@ -21,7 +21,7 @@
functions learned will step between the discretization boundaries.
This implementation is based on this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -57,7 +57,7 @@
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/docs/docstrings/GamRegressor.txt b/src/python/docs/docstrings/GamRegressor.txt
index 54d71d10..3e44a736 100644
--- a/src/python/docs/docstrings/GamRegressor.txt
+++ b/src/python/docs/docstrings/GamRegressor.txt
@@ -21,7 +21,7 @@
functions learned will step between the discretization boundaries.
This implementation is based on this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -57,7 +57,7 @@
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/docs/docstrings/Goss.txt b/src/python/docs/docstrings/Goss.txt
index 7ae86ec2..97265859 100644
--- a/src/python/docs/docstrings/Goss.txt
+++ b/src/python/docs/docstrings/Goss.txt
@@ -5,9 +5,9 @@
.. remarks::
Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling
named gradient-based
- sampling. For datasets with large sample size, GOSS has considerable
+ sampling. For datasets with a large sample size, GOSS has a considerable
advantage in terms of
- statistical and computational efficiency.
+ statistical and computational efficiency.
diff --git a/src/python/docs/docstrings/Handler.txt b/src/python/docs/docstrings/Handler.txt
index 01d767e8..4a639e1e 100644
--- a/src/python/docs/docstrings/Handler.txt
+++ b/src/python/docs/docstrings/Handler.txt
@@ -33,14 +33,13 @@
For more details see `Columns `_.
:param replace_with: The method to use to replace NaN values. The
- following choices are available.
-
- * Def: Replace with default value of that type, usually ``0``. If no
- replace
- method is specified, this is the default strategy.
- * Mean: Replace NaN values with the mean of the values in that column.
- * Min: Replace with minimum value in the column.
- * Max: Replace with maximum value in the column.
+ following choices are available.
+
+ * Def: Replace with default value of that type, usually ``0``. If no
+ replace method is specified, this is the default strategy.
+ * Mean: Replace NaN values with the mean of the values in that column.
+ * Min: Replace with minimum value in the column.
+ * Max: Replace with maximum value in the column.
.. seealso::
:py:class:`Filter `,
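The ``replace_with`` options above map directly onto the transform's constructor. A minimal sketch, assuming the usual import path ``nimbusml.preprocessing.missing_values`` (column names are illustrative):

    import pandas as pd
    from nimbusml.preprocessing.missing_values import Handler

    df = pd.DataFrame({'age': [21.0, None, 35.0]})
    # 'Mean' replaces each NaN with the column mean; 'Def' (the default)
    # would replace it with 0.
    h = Handler(replace_with='Mean', columns={'age_filled': 'age'})
    print(h.fit_transform(df))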
diff --git a/src/python/docs/docstrings/LightLda.txt b/src/python/docs/docstrings/LightLda.txt
index 95736da9..aaec0162 100644
--- a/src/python/docs/docstrings/LightLda.txt
+++ b/src/python/docs/docstrings/LightLda.txt
@@ -10,7 +10,7 @@
topical vectors. LightLDA is an extremely
efficient implementation of LDA developed in MSR-Asia that
incorporates a number of optimization techniques
- `(http://arxiv.org/abs/1412.1576) `_.
+ `(https://arxiv.org/abs/1412.1576) `_.
With the LDA transform, we can
train a topic model to produce 1 million topics with 1 million
vocabulary on a 1-billion-token document set one
diff --git a/src/python/docs/docstrings/Loader.txt b/src/python/docs/docstrings/Loader.txt
index ca290c1e..e94fb9e1 100644
--- a/src/python/docs/docstrings/Loader.txt
+++ b/src/python/docs/docstrings/Loader.txt
@@ -1,6 +1,6 @@
"""
- Loaders image data.
+ Loads image data.
.. remarks::
``Loader`` loads images from paths.
diff --git a/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt b/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt
index cf028dcd..4863237a 100644
--- a/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt
+++ b/src/python/docs/docstrings/LocalDeepSvmBinaryClassifier.txt
@@ -39,14 +39,14 @@
More details about LD-SVM can be found in this paper `Local deep
kernel
learning for efficient non-linear SVM prediction
- `_.
**Reference**
`Local deep kernel learning for efficient non-linear SVM prediction
- `_
diff --git a/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt b/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt
index 6fb1063d..b268dea2 100644
--- a/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt
+++ b/src/python/docs/docstrings/LogisticRegressionBinaryClassifier.txt
@@ -69,14 +69,14 @@
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/docs/docstrings/LogisticRegressionClassifier.txt b/src/python/docs/docstrings/LogisticRegressionClassifier.txt
index db6f386e..405c20f3 100644
--- a/src/python/docs/docstrings/LogisticRegressionClassifier.txt
+++ b/src/python/docs/docstrings/LogisticRegressionClassifier.txt
@@ -70,14 +70,14 @@
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/docs/docstrings/NGram.txt b/src/python/docs/docstrings/NGram.txt
index e05c292a..e4d681db 100644
--- a/src/python/docs/docstrings/NGram.txt
+++ b/src/python/docs/docstrings/NGram.txt
@@ -1,6 +1,6 @@
"""
- Extracts NGrams from text and convert them to vector using
+ Extracts NGrams from text and converts them to a vector using a
dictionary.
.. remarks::
diff --git a/src/python/docs/docstrings/NgramHash.txt b/src/python/docs/docstrings/NgramHash.txt
index b7e34e8a..a1969901 100644
--- a/src/python/docs/docstrings/NgramHash.txt
+++ b/src/python/docs/docstrings/NgramHash.txt
@@ -1,6 +1,6 @@
"""
- Extracts NGrams from text and convert them to vector using hashing
+ Extracts NGrams from text and converts them to a vector using the hashing
trick.
.. remarks::
diff --git a/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt b/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt
index 958bd389..44e9ef30 100644
--- a/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt
+++ b/src/python/docs/docstrings/OneClassSVMAnomalyDetector.txt
@@ -29,10 +29,10 @@
us/library/azure/dn913103.aspx>`_
`Estimating the Support of a High-Dimensional Distribution
- `_
+ `_
`New Support Vector Algorithms
- `_
+ `_
`LIBSVM: A Library for Support Vector Machines
`_
diff --git a/src/python/docs/docstrings/PcaAnomalyDetector.txt b/src/python/docs/docstrings/PcaAnomalyDetector.txt
index 5896c5c9..f51aaf24 100644
--- a/src/python/docs/docstrings/PcaAnomalyDetector.txt
+++ b/src/python/docs/docstrings/PcaAnomalyDetector.txt
@@ -36,13 +36,12 @@
`Randomized Methods for Computing the Singular Value Decomposition
(SVD) of very large matrices
- `_
+ `_
`A randomized algorithm for principal component analysis
`_,
`Finding Structure with Randomness: Probabilistic Algorithms for
Constructing Approximate Matrix Decompositions
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/docs/docstrings/PrefixColumnConcatenator.txt b/src/python/docs/docstrings/PrefixColumnConcatenator.txt
new file mode 100644
index 00000000..aac3d116
--- /dev/null
+++ b/src/python/docs/docstrings/PrefixColumnConcatenator.txt
@@ -0,0 +1,44 @@
+ """
+
+ Combines several columns into a single vector-valued column by prefix.
+
+ .. remarks::
+ ``PrefixColumnConcatenator`` creates a single vector-valued column from
+ multiple
+ columns. It can be performed on data before training a model. The
+ concatenation
+ can significantly speed up the processing of data when the number of
+ columns
+ is as large as hundreds to thousands.
+
+ :param columns: a dictionary of key-value pairs, where key is the output
+ column name and value is a prefix shared by the input column names.
+
+ * Only one key-value pair is allowed.
+ * Input column type: numeric or string.
+ * Output column type:
+ `Vector Type `_.
+
+ The << operator can be used to set this value (see
+ `Column Operator `_)
+
+ For example
+ * PrefixColumnConcatenator(columns={'features': 'Sepal_'})
+ * PrefixColumnConcatenator() << {'features': 'Sepal_'}
+
+ For more details see `Columns `_.
+
+ .. seealso::
+ :py:class:`ColumnDropper
+ `,
+ :py:class:`ColumnSelector
+ `.
+
+ .. index:: transform, schema
+
+ Example:
+ .. literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py
+ :language: python
+ """
diff --git a/src/python/docs/docstrings/Resizer.txt b/src/python/docs/docstrings/Resizer.txt
index eb45128e..2bf9857f 100644
--- a/src/python/docs/docstrings/Resizer.txt
+++ b/src/python/docs/docstrings/Resizer.txt
@@ -1,15 +1,15 @@
"""
- Resizers an image to a specified dimension using a specified
+ Resizes an image to a specified dimension using a specified
resizing method.
.. remarks::
- ``Resizer`` resizers an image to the specified height and width
+ ``Resizer`` resizes an image to the specified height and width
using a specified resizing method. The input variables to this
transform must
be images, typically the result of the ``Loader`` transform.
- :param columns: a dictionary of key-value pairs, where key is the output
+ :param columns: A dictionary of key-value pairs, where key is the output
column name and value is the input column name.
* Multiple key-value pairs are allowed.
diff --git a/src/python/docs/docstrings/SgdBinaryClassifier.txt b/src/python/docs/docstrings/SgdBinaryClassifier.txt
index c1ed86ac..a585e088 100644
--- a/src/python/docs/docstrings/SgdBinaryClassifier.txt
+++ b/src/python/docs/docstrings/SgdBinaryClassifier.txt
@@ -13,14 +13,14 @@
associated optimization problem is sparse, then Hogwild SGD achieves
a
nearly optimal rate of convergence. For a detailed reference, please
- refer to `http://arxiv.org/pdf/1106.5730v2.pdf
- `_.
+ refer to `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_.
**Reference**
- `http://arxiv.org/pdf/1106.5730v2.pdf
- `_
+ `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/docs/docstrings/SigmoidKernel.txt b/src/python/docs/docstrings/SigmoidKernel.txt
index 3a22d2cd..62c5785a 100644
--- a/src/python/docs/docstrings/SigmoidKernel.txt
+++ b/src/python/docs/docstrings/SigmoidKernel.txt
@@ -3,8 +3,7 @@
Apply sigmoid function. tanh(gamma*<x,y> + c).
.. remarks::
- `SigmoidKernel `_ is a
+ `SigmoidKernel `_ is a
kernel function
that computes the similarity between two features.
diff --git a/src/python/docs/docstrings/SsaForecaster.txt b/src/python/docs/docstrings/SsaForecaster.txt
index 8873702b..a8a99a6f 100644
--- a/src/python/docs/docstrings/SsaForecaster.txt
+++ b/src/python/docs/docstrings/SsaForecaster.txt
@@ -11,7 +11,7 @@
input time-series where each component in the spectrum corresponds to a
trend, seasonal or noise component in the time-series. For details of the
Singular Spectrum Analysis (SSA), refer to `this document
- `_.
+ `_.
.. seealso::
:py:func:`IIDChangePointDetector
diff --git a/src/python/docs/docstrings/SsweEmbedding.txt b/src/python/docs/docstrings/SsweEmbedding.txt
index 4c476285..4e6c56f9 100644
--- a/src/python/docs/docstrings/SsweEmbedding.txt
+++ b/src/python/docs/docstrings/SsweEmbedding.txt
@@ -7,12 +7,12 @@
versions of `GloVe Models
`_, `FastText
`_, and `Sswe
- `_.
+ `_.
.. remarks::
Sentiment-specific word embedding (SSWE) is a DNN featurizer
developed
- by MSRA (`paper `_).
+ by MSRA (`paper `_).
It
incorporates sentiment information into the neural network to learn
sentiment specific word embedding. It proves to be useful in various
@@ -63,7 +63,6 @@
.. seealso::
:py:class:`NGramFeaturizer `,
- :py:class:`DssmFeaturizer `,
:py:class:`Sentiment `,
:py:class:`WordEmbedding `.
diff --git a/src/python/docs/docstrings/SupervisedBinner.txt b/src/python/docs/docstrings/SupervisedBinner.txt
index 963a560e..95317b75 100644
--- a/src/python/docs/docstrings/SupervisedBinner.txt
+++ b/src/python/docs/docstrings/SupervisedBinner.txt
@@ -24,7 +24,7 @@
the default is to normalize features before training.
``SupervisedBinner`` implements the `Entropy-Based Discretization
- `_.
+ `_.
Partition of the data is performed recursively to select the split
with highest entropy gain with respect to the label.
Therefore, the final binned features will have high correlation with
diff --git a/src/python/docs/docstrings/ToKey.txt b/src/python/docs/docstrings/ToKey.txt
index 2740561b..89a32047 100644
--- a/src/python/docs/docstrings/ToKey.txt
+++ b/src/python/docs/docstrings/ToKey.txt
@@ -1,7 +1,6 @@
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts input values (words, numbers, etc.) to an index in a dictionary.
.. remarks::
The ``ToKey`` transform converts a column of text to key values
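A minimal sketch of the key conversion summarized above (values are illustrative):

    import pandas as pd
    from nimbusml.preprocessing import ToKey

    df = pd.DataFrame({'color': ['red', 'blue', 'red']})
    # Each distinct value becomes an index into a learned dictionary;
    # FromKey reverses the mapping.
    print(ToKey(columns=['color']).fit_transform(df))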
diff --git a/src/python/docs/docstrings/WordEmbedding.txt b/src/python/docs/docstrings/WordEmbedding.txt
index 41d6f1c6..f19c73d8 100644
--- a/src/python/docs/docstrings/WordEmbedding.txt
+++ b/src/python/docs/docstrings/WordEmbedding.txt
@@ -10,7 +10,7 @@
available options are various versions of `GloVe Models
`_, `FastText
`_, and `Sswe
- `_.
+ `_.
:param model_kind: Pre-trained model used to create the vocabulary.
diff --git a/src/python/docs/sphinx/ci_script/_static/mystyle.css b/src/python/docs/sphinx/ci_script/_static/mystyle.css
index a69e381c..a5df3a24 100644
--- a/src/python/docs/sphinx/ci_script/_static/mystyle.css
+++ b/src/python/docs/sphinx/ci_script/_static/mystyle.css
@@ -8432,7 +8432,7 @@ label {
padding: 0px;
}
/* Flexible box model classes */
-/* Taken from Alex Russell http://infrequently.org/2009/08/css-3-progress/ */
+/* Taken from Alex Russell https://infrequently.org/2009/08/css-3-progress/ */
/* This file is a compatibility layer. It allows the usage of flexible box
model layouts across multiple browsers, including older browsers. The newest,
universal implementation of the flexible box model is used when available (see
diff --git a/src/python/docs/sphinx/ci_script/conf.py b/src/python/docs/sphinx/ci_script/conf.py
index f96889d1..1acb3312 100644
--- a/src/python/docs/sphinx/ci_script/conf.py
+++ b/src/python/docs/sphinx/ci_script/conf.py
@@ -128,8 +128,8 @@
'relative': True,
'reference_url': {
'nimbusml': None,
- 'matplotlib': 'http://matplotlib.org',
- 'numpy': 'http://www.numpy.org/',
+ 'matplotlib': 'https://matplotlib.org',
+ 'numpy': 'https://www.numpy.org/',
'scipy': 'https://www.scipy.org/'},
}
diff --git a/src/python/docs/sphinx/concepts/datasources.rst b/src/python/docs/sphinx/concepts/datasources.rst
index 0a8b1986..731fad98 100644
--- a/src/python/docs/sphinx/concepts/datasources.rst
+++ b/src/python/docs/sphinx/concepts/datasources.rst
@@ -120,15 +120,21 @@ Example:
Output Data Types of Transforms
-------------------------------
-The return type of all of the transforms is a ``pandas.DataFrame``, when they
-are used inside a `sklearn.pipeline.Pipeline
-`_
-or when they are used individually.
-
-However, when used inside a :py:class:`nimbusml.Pipeline`, the outputs are often stored in
+When used inside a `sklearn.pipeline.Pipeline
+`_,
+the return type of all of the transforms is a ``pandas.DataFrame``.
+
+When used individually or inside a :py:class:`nimbusml.Pipeline`
+that contains only transforms, the default output is a ``pandas.DataFrame``. To instead output an
+`IDataView `_,
+pass ``as_binary_data_stream=True`` to either ``transform()`` or ``fit_transform()``.
+To output a sparse CSR matrix, pass ``as_csr=True``.
+See :py:class:`nimbusml.Pipeline` for more information.
+
+Note that when used inside a :py:class:`nimbusml.Pipeline`, the outputs are often stored in
a more optimized :ref:`VectorDataViewType`, which minimizes data conversion to
dataframes. When several transforms are combined inside an :py:class:`nimbusml.Pipeline`,
the intermediate transforms will store the data in the optimized format and only
-the last transform will return a ``pandas.DataFrame``.
+the last transform will return a ``pandas.DataFrame`` (or IDataView/CSR; see above).
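A minimal sketch of the output options described above (the dataset and transform are illustrative):

    from nimbusml import FileDataStream, Pipeline
    from nimbusml.datasets import get_dataset
    from nimbusml.feature_extraction.categorical import OneHotVectorizer

    data = FileDataStream.read_csv(get_dataset('infert').as_filepath())
    pipe = Pipeline([OneHotVectorizer(columns={'edu': 'education'})])

    df = pipe.fit_transform(data)                # default: pandas.DataFrame
    csr = pipe.fit_transform(data, as_csr=True)  # scipy CSR sparse matrix
    # as_binary_data_stream=True would instead return an IDataView.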
diff --git a/src/python/docs/sphinx/conf.py b/src/python/docs/sphinx/conf.py
index 1f0cccfc..9fb1d4ab 100644
--- a/src/python/docs/sphinx/conf.py
+++ b/src/python/docs/sphinx/conf.py
@@ -145,8 +145,8 @@ def install_and_import(package):
'relative': True,
'reference_url': {
'nimbusml': None,
- 'matplotlib': 'http://matplotlib.org',
- 'numpy': 'http://www.numpy.org/',
+ 'matplotlib': 'https://matplotlib.org',
+ 'numpy': 'https://www.numpy.org/',
'scipy': 'https://www.scipy.org/'},
}
diff --git a/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst b/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst
deleted file mode 100644
index 7aa663ee..00000000
--- a/src/python/docs/sphinx/modules/feature_extraction/text/dssmfeaturizer.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-`nimbusml.feature_extraction.text.DssmFeaturizer`
-===========================================================
-
-.. autoclass:: nimbusml.feature_extraction.text.DssmFeaturizer
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index dc1a2c39..452dac23 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -93,8 +93,10 @@
-
+
+
+
@@ -249,6 +251,7 @@
+
@@ -296,6 +299,7 @@
+
@@ -352,7 +356,6 @@
-
@@ -451,7 +454,6 @@
-
@@ -474,6 +476,7 @@
+
@@ -530,8 +533,6 @@
-
-
@@ -706,6 +707,7 @@
+
@@ -716,10 +718,10 @@
+
-
@@ -819,14 +821,17 @@
+
+
+
@@ -983,7 +988,6 @@
-
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index 0fdadc02..afb13002 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
Microsoft Machine Learning for Python
"""
-__version__ = '1.5.1'
+__version__ = '1.6.1'
# CoreCLR version of MicrosoftML is built on Windows.
# But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/base_transform.py b/src/python/nimbusml/base_transform.py
index 393c3655..b227d567 100644
--- a/src/python/nimbusml/base_transform.py
+++ b/src/python/nimbusml/base_transform.py
@@ -35,7 +35,19 @@ def fit_transform(self, X, y=None, as_binary_data_stream=False,
:param X: array-like with shape=[n_samples, n_features] or else
:py:class:`nimbusml.FileDataStream`
:param y: array-like with shape=[n_samples]
- :return: pandas.DataFrame
+ :param as_binary_data_stream: If ``True`` then output an IDV file.
+ See `here `_
+ for more information.
+ :param params: Additional arguments.
+ If ``as_csr=True``, return the transformed data in CSR
+ (sparse matrix) format. If ``as_binary_data_stream=True`` as
+ well, it takes precedence over ``as_csr`` and the output will
+ be an IDV file.
+
+ :return: Returns a pandas DataFrame if no other output format
+ is specified. See ``as_binary_data_stream`` and ``as_csr``
+ for other available output formats.
"""
pipeline = Pipeline([self])
try:
@@ -88,8 +100,20 @@ def transform(self, X, as_binary_data_stream=False, **params):
Applies transform to data.
:param X: array-like with shape=[n_samples, n_features] or else
- :py:class:`nimbusml.FileDataStream`
- :return: pandas.DataFrame
+ :py:class:`nimbusml.FileDataStream`
+ :param as_binary_data_stream: If ``True`` then output an IDV file.
+ See `here `_
+ for more information.
+ :param params: Additional arguments.
+ If ``as_csr=True``, return the transformed data in CSR
+ (sparse matrix) format. If ``as_binary_data_stream=True`` as
+ well, it takes precedence over ``as_csr`` and the output will
+ be an IDV file.
+
+ :return: Returns a pandas DataFrame if no other output format
+ is specified. See ``as_binary_data_stream`` and ``as_csr``
+ for other available output formats.
"""
# Check that the input is of the same shape as the one passed
# during
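A minimal sketch of the precedence rule documented above (the transform is illustrative):

    import pandas as pd
    from nimbusml.preprocessing.normalization import MinMaxScaler

    df = pd.DataFrame({'x': [1.0, 2.0, 4.0]})
    scaler = MinMaxScaler(columns=['x'])
    scaler.fit(df)

    out_df = scaler.transform(df)                # default: pandas.DataFrame
    out_csr = scaler.transform(df, as_csr=True)  # CSR sparse matrix
    # Passing as_binary_data_stream=True as well would override as_csr.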
diff --git a/src/python/nimbusml/datasets/datasets.py b/src/python/nimbusml/datasets/datasets.py
index 56c325a6..9f040ff1 100644
--- a/src/python/nimbusml/datasets/datasets.py
+++ b/src/python/nimbusml/datasets/datasets.py
@@ -3,7 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
"""
-Datasets used in MicrosoftML unittests.
+Datasets used in MicrosoftML unittests.
"""
import copy
import os
@@ -15,6 +15,8 @@
__all__ = ["get_dataset", "available_datasets"]
+DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+
class DataSet:
"""
@@ -175,11 +177,7 @@ def load(self):
# isCase ~ age + parity + education + spontaneous + induced
# education age parity induced case spontaneous stratum
# pooled.stratum
- this = os.path.join(
- os.path.dirname(__file__),
- "data",
- "gplv2",
- "infert.csv")
+ this = os.path.join(DATA_DIR, "gplv2", "infert.csv")
self.__dict__['_data'] = pandas.read_csv(this)
self.__dict__['case'] = self._data["case"]
self._finalize()
@@ -229,11 +227,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "gplv2",
- "infert.csv")
+ return os.path.join(DATA_DIR, "gplv2", "infert.csv")
class DataSetAirQuality(DataSet):
@@ -262,11 +256,7 @@ def load(self):
# isCase ~ age + parity + education + spontaneous + induced
# education age parity induced case spontaneous stratum
# pooled.stratum
- this = os.path.join(
- os.path.dirname(__file__),
- "data",
- "gplv2",
- "airquality.csv")
+ this = os.path.join(DATA_DIR, "gplv2", "airquality.csv")
self.__dict__['_data'] = pandas.read_csv(this)
self._finalize()
@@ -294,11 +284,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "gplv2",
- "airquality.csv")
+ return os.path.join(DATA_DIR, "gplv2", "airquality.csv")
class Topics(DataSet):
@@ -324,8 +310,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(os.path.dirname(__file__), "data",
- "topics.csv")
+ return os.path.join(DATA_DIR, "topics.csv")
class Timeseries(DataSet):
@@ -351,10 +336,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "timeseries.csv")
+ return os.path.join(DATA_DIR, "timeseries.csv")
class WikiDetox_Train(DataSet):
@@ -379,10 +361,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train-250.wikipedia.sample.tsv")
+ return os.path.join(DATA_DIR, "train-250.wikipedia.sample.tsv")
class WikiDetox_Test(DataSet):
@@ -407,10 +386,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test.wikipedia.sample.tsv")
+ return os.path.join(DATA_DIR, "test.wikipedia.sample.tsv")
class FS_Train(DataSet):
@@ -435,10 +411,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train_fs.csv")
+ return os.path.join(DATA_DIR, "train_fs.csv")
class FS_Test(DataSet):
@@ -463,10 +436,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test_fs.csv")
+ return os.path.join(DATA_DIR, "test_fs.csv")
class MSLTR_Train(DataSet):
@@ -492,10 +462,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train-msltr.sample.csv")
+ return os.path.join(DATA_DIR, "train-msltr.sample.csv")
class MSLTR_Test(DataSet):
@@ -521,10 +488,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test-msltr.sample.csv")
+ return os.path.join(DATA_DIR, "test-msltr.sample.csv")
class Uci_Train(DataSet):
@@ -548,10 +512,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train-500.uciadult.sample.csv")
+ return os.path.join(DATA_DIR, "train-500.uciadult.sample.csv")
class Uci_Test(DataSet):
@@ -575,10 +536,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test-100.uciadult.sample.csv")
+ return os.path.join(DATA_DIR, "test-100.uciadult.sample.csv")
class Generated_Twitter_Train(DataSet):
@@ -603,10 +561,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train-twitter.gen-sample.tsv")
+ return os.path.join(DATA_DIR, "train-twitter.gen-sample.tsv")
class Generated_Twitter_Test(DataSet):
@@ -631,10 +586,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test-twitter.gen-sample.tsv")
+ return os.path.join(DATA_DIR, "test-twitter.gen-sample.tsv")
class Generated_Ticket_Train(DataSet):
@@ -659,10 +611,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "train-ticketchoice.csv")
+ return os.path.join(DATA_DIR, "train-ticketchoice.csv")
class Generated_Ticket_Test(DataSet):
@@ -687,10 +636,7 @@ def as_filepath(self):
"""
Return file name.
"""
- return os.path.join(
- os.path.dirname(__file__),
- "data",
- "test-ticketchoice.csv")
+ return os.path.join(DATA_DIR, "test-ticketchoice.csv")
_datasets = dict(
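Usage is unchanged by the DATA_DIR refactoring above; a quick check:

    from nimbusml.datasets import get_dataset

    # Every dataset now resolves its file relative to the shared DATA_DIR.
    print(get_dataset('infert').as_filepath())
    # .../nimbusml/datasets/data/gplv2/infert.csv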
diff --git a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py
index fd3d75a2..4a8f6c44 100644
--- a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py
+++ b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py
@@ -44,7 +44,7 @@ class FactorizationMachineBinaryClassifier(
`Field Aware Factorization Machines
`_,
`Field-aware Factorization Machines for CTR Prediction
- `_,
+ `_,
`Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization
`_
diff --git a/src/python/nimbusml/decomposition/pcaanomalydetector.py b/src/python/nimbusml/decomposition/pcaanomalydetector.py
index bdf42b22..85938224 100644
--- a/src/python/nimbusml/decomposition/pcaanomalydetector.py
+++ b/src/python/nimbusml/decomposition/pcaanomalydetector.py
@@ -57,13 +57,12 @@ class PcaAnomalyDetector(core, BasePredictor, ClassifierMixin):
`Randomized Methods for Computing the Singular Value Decomposition
(SVD) of very large matrices
- `_
+ `_
`A randomized algorithm for principal component analysis
`_,
`Finding Structure with Randomness: Probabilistic Algorithms for
Constructing Approximate Matrix Decompositions
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/ensemble/booster/dart.py b/src/python/nimbusml/ensemble/booster/dart.py
index 33dc8295..ab6857e2 100644
--- a/src/python/nimbusml/ensemble/booster/dart.py
+++ b/src/python/nimbusml/ensemble/booster/dart.py
@@ -24,9 +24,9 @@ class Dart(core):
`_ is an
ensemble method of boosted regression trees. The Dropouts meet
Multiple Additive Regression
- Trees (DART) employs dropouts in MART and overcomes the issues of over-
+ Trees (DART) employs dropouts in MART and overcomes the issues of over-
specialization of MART,
- achiving better performance in many tasks.
+ achieving better performance in many tasks.
**Reference**
diff --git a/src/python/nimbusml/ensemble/booster/goss.py b/src/python/nimbusml/ensemble/booster/goss.py
index 8e57181b..9b17e4ad 100644
--- a/src/python/nimbusml/ensemble/booster/goss.py
+++ b/src/python/nimbusml/ensemble/booster/goss.py
@@ -22,9 +22,9 @@ class Goss(core):
.. remarks::
Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling
named gradient-based
- sampling. For datasets with large sample size, GOSS has considerable
+ sampling. For datasets with a large sample size, GOSS has a considerable
advantage in terms of
- statistical and computational efficiency.
+ statistical and computational efficiency.
diff --git a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py
index ea911977..5e6d5bd9 100644
--- a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py
+++ b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py
@@ -55,7 +55,7 @@ class FastForestBinaryClassifier(
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/nimbusml/ensemble/fastforestregressor.py b/src/python/nimbusml/ensemble/fastforestregressor.py
index 5a2affe4..cb20c847 100644
--- a/src/python/nimbusml/ensemble/fastforestregressor.py
+++ b/src/python/nimbusml/ensemble/fastforestregressor.py
@@ -64,7 +64,7 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin):
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py
index 8c12cb48..24f633fe 100644
--- a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py
+++ b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py
@@ -81,7 +81,7 @@ class FastTreesBinaryClassifier(
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/ensemble/fasttreesregressor.py b/src/python/nimbusml/ensemble/fasttreesregressor.py
index c3994230..12c8c59b 100644
--- a/src/python/nimbusml/ensemble/fasttreesregressor.py
+++ b/src/python/nimbusml/ensemble/fasttreesregressor.py
@@ -83,7 +83,7 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin):
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py
index 1db266b7..177d9ede 100644
--- a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py
+++ b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py
@@ -38,7 +38,7 @@ class FastTreesTweedieRegressor(
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/ensemble/gambinaryclassifier.py b/src/python/nimbusml/ensemble/gambinaryclassifier.py
index eb08e95c..79808610 100644
--- a/src/python/nimbusml/ensemble/gambinaryclassifier.py
+++ b/src/python/nimbusml/ensemble/gambinaryclassifier.py
@@ -42,7 +42,7 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin):
functions learned will step between the discretization boundaries.
This implementation is based on this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -78,7 +78,7 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin):
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/ensemble/gamregressor.py b/src/python/nimbusml/ensemble/gamregressor.py
index c57ad499..45796805 100644
--- a/src/python/nimbusml/ensemble/gamregressor.py
+++ b/src/python/nimbusml/ensemble/gamregressor.py
@@ -41,7 +41,7 @@ class GamRegressor(core, BasePredictor, RegressorMixin):
functions learned will step between the discretization boundaries.
This implementation is based on this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -77,7 +77,7 @@ class GamRegressor(core, BasePredictor, RegressorMixin):
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py b/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py
index 69566dab..1e0bd727 100644
--- a/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py
+++ b/src/python/nimbusml/examples/AveragedPerceptronBinaryClassifier.py
@@ -20,7 +20,6 @@
feature=['age', 'parity', 'spontaneous'], label='case')])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py b/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py
index 508f8a84..52dbcc6f 100644
--- a/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py
+++ b/src/python/nimbusml/examples/FactorizationMachineBinaryClassifier.py
@@ -26,7 +26,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastForestBinaryClassifier.py b/src/python/nimbusml/examples/FastForestBinaryClassifier.py
index aa7f34ed..1f1a5e3f 100644
--- a/src/python/nimbusml/examples/FastForestBinaryClassifier.py
+++ b/src/python/nimbusml/examples/FastForestBinaryClassifier.py
@@ -25,7 +25,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastLinearBinaryClassifier.py b/src/python/nimbusml/examples/FastLinearBinaryClassifier.py
index fd38072a..73f72f03 100644
--- a/src/python/nimbusml/examples/FastLinearBinaryClassifier.py
+++ b/src/python/nimbusml/examples/FastLinearBinaryClassifier.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastLinearClassifier.py b/src/python/nimbusml/examples/FastLinearClassifier.py
index d668a49e..32d00ecd 100644
--- a/src/python/nimbusml/examples/FastLinearClassifier.py
+++ b/src/python/nimbusml/examples/FastLinearClassifier.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastLinearRegressor.py b/src/python/nimbusml/examples/FastLinearRegressor.py
index 4fb64001..64b97cc4 100644
--- a/src/python/nimbusml/examples/FastLinearRegressor.py
+++ b/src/python/nimbusml/examples/FastLinearRegressor.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastTreesBinaryClassifier.py b/src/python/nimbusml/examples/FastTreesBinaryClassifier.py
index 4d9712e1..6a3d1458 100644
--- a/src/python/nimbusml/examples/FastTreesBinaryClassifier.py
+++ b/src/python/nimbusml/examples/FastTreesBinaryClassifier.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastTreesRegressor.py b/src/python/nimbusml/examples/FastTreesRegressor.py
index aac8fc38..a08ac653 100644
--- a/src/python/nimbusml/examples/FastTreesRegressor.py
+++ b/src/python/nimbusml/examples/FastTreesRegressor.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/FastTreesTweedieRegressor.py b/src/python/nimbusml/examples/FastTreesTweedieRegressor.py
index f6a0bac1..008107ac 100644
--- a/src/python/nimbusml/examples/FastTreesTweedieRegressor.py
+++ b/src/python/nimbusml/examples/FastTreesTweedieRegressor.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/GamBinaryClassifier.py b/src/python/nimbusml/examples/GamBinaryClassifier.py
index 78ee1ba4..de8d049f 100644
--- a/src/python/nimbusml/examples/GamBinaryClassifier.py
+++ b/src/python/nimbusml/examples/GamBinaryClassifier.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/GamRegressor.py b/src/python/nimbusml/examples/GamRegressor.py
index c4bf43f8..82a3b70b 100644
--- a/src/python/nimbusml/examples/GamRegressor.py
+++ b/src/python/nimbusml/examples/GamRegressor.py
@@ -23,7 +23,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/Image.py b/src/python/nimbusml/examples/Image.py
index 08c6aa35..78ee62b6 100644
--- a/src/python/nimbusml/examples/Image.py
+++ b/src/python/nimbusml/examples/Image.py
@@ -18,10 +18,10 @@
X = data[['Path']]
y = data[['Label']]
-# define the training pipeline
+# define the training pipeline
pipeline = Pipeline([
Loader(columns={'ImgPath': 'Path'}),
- Resizer(image_width=227, image_height=227,
+ Resizer(image_width=32, image_height=32,
columns={'ImgResize': 'ImgPath'}),
PixelExtractor(columns={'ImgPixels': 'ImgResize'}),
FastLinearBinaryClassifier(feature='ImgPixels')
diff --git a/src/python/nimbusml/examples/KMeansPlusPlus.py b/src/python/nimbusml/examples/KMeansPlusPlus.py
index fab4c2d8..673feb95 100644
--- a/src/python/nimbusml/examples/KMeansPlusPlus.py
+++ b/src/python/nimbusml/examples/KMeansPlusPlus.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline \
.fit(data) \
.test(data, 'induced', output_scores=True)
diff --git a/src/python/nimbusml/examples/LightGbmBinaryClassifier.py b/src/python/nimbusml/examples/LightGbmBinaryClassifier.py
index 3774c815..b4a99dda 100644
--- a/src/python/nimbusml/examples/LightGbmBinaryClassifier.py
+++ b/src/python/nimbusml/examples/LightGbmBinaryClassifier.py
@@ -26,7 +26,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(
data, 'case').test(
data, output_scores=True)
diff --git a/src/python/nimbusml/examples/LightGbmClassifier.py b/src/python/nimbusml/examples/LightGbmClassifier.py
index 15179a3b..543f72ca 100644
--- a/src/python/nimbusml/examples/LightGbmClassifier.py
+++ b/src/python/nimbusml/examples/LightGbmClassifier.py
@@ -26,7 +26,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/LightGbmRanker.py b/src/python/nimbusml/examples/LightGbmRanker.py
index b137ff94..7b04a87d 100644
--- a/src/python/nimbusml/examples/LightGbmRanker.py
+++ b/src/python/nimbusml/examples/LightGbmRanker.py
@@ -16,7 +16,6 @@
feature=['Class', 'dep_day', 'duration'], label='rank', group_id='group')])
# train, predict, and evaluate.
-# TODO: Replace with CV
metrics, predictions = pipeline \
.fit(data) \
.test(data, output_scores=True)
diff --git a/src/python/nimbusml/examples/LightGbmRegressor.py b/src/python/nimbusml/examples/LightGbmRegressor.py
index 6165f614..cac8a047 100644
--- a/src/python/nimbusml/examples/LightGbmRegressor.py
+++ b/src/python/nimbusml/examples/LightGbmRegressor.py
@@ -26,7 +26,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
index 1a2d70e6..50d760ec 100644
--- a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
+++ b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
@@ -20,7 +20,6 @@
feature=['age', 'parity', 'spontaneous'], label='case')])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py b/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py
index a99b5dc3..e9b15be8 100644
--- a/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py
+++ b/src/python/nimbusml/examples/LogisticRegressionBinaryClassifier.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/LogisticRegressionClassifier.py b/src/python/nimbusml/examples/LogisticRegressionClassifier.py
index 232605c8..80af4ee0 100644
--- a/src/python/nimbusml/examples/LogisticRegressionClassifier.py
+++ b/src/python/nimbusml/examples/LogisticRegressionClassifier.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/NGramExtractor.py b/src/python/nimbusml/examples/NGramExtractor.py
new file mode 100644
index 00000000..facb4596
--- /dev/null
+++ b/src/python/nimbusml/examples/NGramExtractor.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+###############################################################################
+# NGramExtractor
+from nimbusml import FileDataStream, Pipeline
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.schema import ColumnDropper
+from nimbusml.preprocessing.text import CharTokenizer
+from nimbusml.feature_extraction.text import NGramExtractor
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+# Sentiment SentimentText
+# 0 1 ==RUDE== Dude, you are rude upload that carl p...
+# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
+# 2 1 Stop trolling, zapatancas, calling me a liar m...
+# 3 1 ==You're cool== You seem like a really cool g...
+# 4 1 ::::: Why are you threatening me? I'm not bein...
+
+# transform usage
+pipe = Pipeline([
+ CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
+ NGramExtractor(ngram_length=1, all_lengths=False, columns={'Ngrams': 'SentimentText_Transform'}),
+ ColumnDropper(columns=['SentimentText_Transform', 'SentimentText', 'Sentiment'])
+ ])
+
+# fit and transform
+features = pipe.fit_transform(data)
+
+print(features.head())
+# Ngrams.<␂> Ngrams.= Ngrams.R Ngrams.U Ngrams.D Ngrams.E ...
+# 0 1.0 4.0 1.0 1.0 2.0 1.0 ...
+# 1 1.0 4.0 0.0 0.0 2.0 3.0 ...
+# 2 1.0 0.0 0.0 0.0 0.0 0.0 ...
+# 3 1.0 4.0 0.0 0.0 0.0 0.0 ...
+# 4 1.0 0.0 0.0 0.0 0.0 0.0 ...
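Note on the parameters used above: `all_lengths=False` restricts the output to n-grams of exactly `ngram_length` (here, single characters). A minimal sketch, under the same transforms, of how `all_lengths=True` with `ngram_length=2` would keep unigram and bigram counts together; the data and column names are illustrative, not part of this change:

```python
# Sketch only: all_lengths=True keeps every n-gram length up to
# ngram_length, so the output contains unigrams *and* bigrams.
import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import CharTokenizer
from nimbusml.feature_extraction.text import NGramExtractor

df = pandas.DataFrame(dict(text=['abc', 'cba']))
pipe = Pipeline([
    CharTokenizer(columns={'chars': 'text'}),
    NGramExtractor(ngram_length=2, all_lengths=True,
                   columns={'ngrams': 'chars'}),
])
print(pipe.fit_transform(df).head())
```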
diff --git a/src/python/nimbusml/examples/NaiveBayesClassifier.py b/src/python/nimbusml/examples/NaiveBayesClassifier.py
index 04e038af..8cabd122 100644
--- a/src/python/nimbusml/examples/NaiveBayesClassifier.py
+++ b/src/python/nimbusml/examples/NaiveBayesClassifier.py
@@ -25,7 +25,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/OneVsRestClassifier.py b/src/python/nimbusml/examples/OneVsRestClassifier.py
index e5c864cb..caef3cc6 100644
--- a/src/python/nimbusml/examples/OneVsRestClassifier.py
+++ b/src/python/nimbusml/examples/OneVsRestClassifier.py
@@ -30,7 +30,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py b/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py
index 85f6e49f..95a6f18c 100644
--- a/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py
+++ b/src/python/nimbusml/examples/OnlineGradientDescentRegressor.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py b/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py
index c394f23b..8a9feebc 100644
--- a/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py
+++ b/src/python/nimbusml/examples/OrdinaryLeastSquaresRegressor.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/PcaAnomalyDetector.py b/src/python/nimbusml/examples/PcaAnomalyDetector.py
index dfe50237..8e16aa91 100644
--- a/src/python/nimbusml/examples/PcaAnomalyDetector.py
+++ b/src/python/nimbusml/examples/PcaAnomalyDetector.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(
data, 'case', output_scores=True)
# Score
diff --git a/src/python/nimbusml/examples/PoissonRegressionRegressor.py b/src/python/nimbusml/examples/PoissonRegressionRegressor.py
index 5edd5d27..0e2a3653 100644
--- a/src/python/nimbusml/examples/PoissonRegressionRegressor.py
+++ b/src/python/nimbusml/examples/PoissonRegressionRegressor.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py
index c0b8d493..c54e708d 100644
--- a/src/python/nimbusml/examples/Schema.py
+++ b/src/python/nimbusml/examples/Schema.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
###############################################################################
# Get schema from a fitted pipeline example.
import numpy as np
@@ -30,4 +31,4 @@
schema = pipe.get_output_columns()
print(schema[0:5])
-# ['Sentiment', 'SentimentText', 'features.Char.>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
+# ['Sentiment', 'SentimentText', 'features.Char.<␂>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
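The `<␂>` glyph in the regenerated output appears to be the printable form of the start-of-text control character that char-level tokenization prepends to each string, with `<␃>` as the matching end marker; this is inferred from the symbols shown, not stated anywhere in this change. A one-line sketch of the assumed mapping:

```python
# Assumed mapping: the glyphs are the Unicode "control picture" forms
# of the STX/ETX control codes inserted at the text boundaries.
START_MARK, END_MARK = '\u2402', '\u2403'  # rendered as <␂> and <␃>
```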
diff --git a/src/python/nimbusml/examples/SgdBinaryClassifier.py b/src/python/nimbusml/examples/SgdBinaryClassifier.py
index df6c7c6a..a31576f0 100644
--- a/src/python/nimbusml/examples/SgdBinaryClassifier.py
+++ b/src/python/nimbusml/examples/SgdBinaryClassifier.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/SymSgdBinaryClassifier.py b/src/python/nimbusml/examples/SymSgdBinaryClassifier.py
index 9cae2d8f..0d5c09a5 100644
--- a/src/python/nimbusml/examples/SymSgdBinaryClassifier.py
+++ b/src/python/nimbusml/examples/SymSgdBinaryClassifier.py
@@ -24,7 +24,6 @@
])
# train, predict, and evaluate
-# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
# print predictions
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
index 8e33ab7b..f049c39a 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
@@ -1,7 +1,9 @@
###############################################################################
# DateTimeSplitter
import pandas
+from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
+from nimbusml.preprocessing.schema import ColumnSelector
df = pandas.DataFrame(data=dict(
tokens1=[1, 2, 3, 157161600],
@@ -9,16 +11,16 @@
))
cols_to_drop = [
- 'Hour12', 'DayOfWeek', 'DayOfQuarter',
- 'DayOfYear', 'WeekOfMonth', 'QuarterOfYear',
- 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel',
- 'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff'
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]
-cd = DateTimeSplitter(prefix='dt',
- country='Canada',
- columns_to_drop=cols_to_drop) << 'tokens1'
-y = cd.fit_transform(df)
+dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
+
+pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+y = pipeline.fit_transform(df)
# view the three columns
pandas.set_option('display.max_columns', None)
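Because `columns_to_drop` was removed from `DateTimeSplitter` itself (see the core change later in this diff), the expanded columns now carry the transform's prefix and are pruned with a separate `ColumnSelector` stage. A small sketch of deriving the `dt`-prefixed drop list from the prefix instead of hard-coding it; the base names come from the example above:

```python
# Sketch: build the prefixed drop list programmatically.
prefix = 'dt'
base_names = [
    'Hour12', 'DayOfWeek', 'DayOfQuarter', 'DayOfYear', 'WeekOfMonth',
    'QuarterOfYear', 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel',
    'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff',
]
cols_to_drop = [prefix + name for name in base_names]
```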
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py b/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py
index 8dd050a0..3cdbb00e 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/Image_df.py
@@ -18,7 +18,7 @@
# transforms and learners
transform_1 = Loader() << 'Path'
-transform_2 = Resizer(image_width=227, image_height=227)
+transform_2 = Resizer(image_width=32, image_height=32)
transform_3 = PixelExtractor()
algo = FastLinearBinaryClassifier() << 'Path'
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py
new file mode 100644
index 00000000..ddc27ab3
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+###############################################################################
+# Example with NGramExtractor and LogisticRegressionBinaryClassifier
+import pandas
+from nimbusml import Pipeline
+from nimbusml.feature_extraction.text import NGramExtractor
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier
+from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper
+from nimbusml.preprocessing.text import CharTokenizer
+
+train_reviews = pandas.DataFrame(
+ data=dict(
+ review=[
+ "This is great",
+ "I hate it",
+ "Love it",
+ "Do not like it",
+ "Really like it",
+ "I hate it",
+ "I like it a lot",
+ "I kind of hate it",
+ "I do like it",
+ "I really hate it",
+ "It is very good",
+ "I hate it a bunch",
+ "I love it a bunch",
+ "I hate it",
+ "I like it very much",
+ "I hate it very much.",
+ "I really do love it",
+ "I really do hate it",
+ "Love it!",
+ "Hate it!",
+ "I love it",
+ "I hate it",
+ "I love it",
+ "I hate it",
+ "I love it"],
+ like=[
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True,
+ False,
+ True]))
+
+test_reviews = pandas.DataFrame(
+ data=dict(
+ review=[
+ "This is great",
+ "I hate it",
+ "Love it",
+ "Really like it",
+ "I hate it",
+ "I like it a lot",
+ "I love it",
+ "I do like it",
+ "I really hate it",
+ "I love it"]))
+
+y = train_reviews['like']
+X = train_reviews.loc[:, train_reviews.columns != 'like']
+
+pipeline = Pipeline([
+ CharTokenizer(columns={'review_transform': 'review'}),
+ NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
+ ColumnDropper(columns=['review_transform', 'review'])
+])
+X = pipeline.fit_transform(X)
+
+print(X.head())
+# ngrams.<␂>|T|h ngrams.T|h|i ngrams.h|i|s ngrams.i|s|<␠> ... ngrams.i|t|! ngrams.t|!|<␃> ngrams.<␂>|H|a ngrams.H|a|t
+# 0 1.0 1.0 1.0 2.0 ... 0.0 0.0 0.0 0.0
+# 1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
+# 2 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
+# 3 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
+# 4 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
+
+model = LogisticRegressionBinaryClassifier().fit(X, y)
+
+X_test = pipeline.transform(test_reviews)
+result = model.predict(X_test)
+
+print(result)
+# 0 True
+# 1 False
+# 2 True
+# 3 True
+# 4 False
+# 5 True
+# 6 True
+# 7 True
+# 8 False
+# 9 True
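The example fits the featurization pipeline once on the training reviews and reuses it via `transform` on the test reviews, so both sets share one n-gram dictionary. An alternative sketch (same imports and data frames as the example above, assumed equivalent) folds the learner into a single `Pipeline`, letting `fit`/`predict` run the text transforms implicitly:

```python
# Sketch only: one pipeline that featurizes and classifies together.
combined = Pipeline([
    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=False,
                   columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review']),
    LogisticRegressionBinaryClassifier(),
])
combined.fit(train_reviews[['review']], train_reviews['like'])
print(combined.predict(test_reviews))
```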
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py
index 9a4eba53..074ce92f 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py
@@ -2,8 +2,7 @@
# WordEmbedding: pre-trained transform to generate word embeddings
import pandas
from nimbusml import Pipeline
-from nimbusml.feature_extraction.text import WordEmbedding
-from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer
+from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram
# create the data
@@ -22,5 +21,12 @@
])
y = pipeline.fit_transform(customer_reviews)
-# view the review embeddings
-# print(y.head())
+# view a small subset of the review embeddings
+print(y.iloc[:5, -3:])
+# review_TransformedText.147 review_TransformedText.148 review_TransformedText.149
+# 0 1.918661 -0.714531 3.062141
+# 1 1.891922 -0.248650 1.706620
+# 2 1.601611 0.309785 3.379576
+# 3 1.970666 1.477450 3.110802
+# 4 2.521791 0.122538 3.129919
+
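The 150 `review_TransformedText.*` columns are consistent with the usual word-embedding pooling scheme: the element-wise minimum, average, and maximum of the per-token vectors concatenated together (3 × 50 dimensions for the default Sswe model). The exact layout is an assumption here; a defensive sketch against the `y` produced above:

```python
# Assumption: 3 pooled slices (min/avg/max) of 50-dim Sswe vectors.
embedding_cols = [c for c in y.columns
                  if c.startswith('review_TransformedText.')]
assert len(embedding_cols) == 150
```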
diff --git a/src/python/nimbusml/feature_extraction/image/loader.py b/src/python/nimbusml/feature_extraction/image/loader.py
index bd93a080..0ee0e305 100644
--- a/src/python/nimbusml/feature_extraction/image/loader.py
+++ b/src/python/nimbusml/feature_extraction/image/loader.py
@@ -20,7 +20,7 @@
class Loader(core, BaseTransform, TransformerMixin):
"""
- Loaders image data.
+ Loads image data.
.. remarks::
``Loader`` loads images from paths.
diff --git a/src/python/nimbusml/feature_extraction/image/resizer.py b/src/python/nimbusml/feature_extraction/image/resizer.py
index 77d9434f..2a8baf4a 100644
--- a/src/python/nimbusml/feature_extraction/image/resizer.py
+++ b/src/python/nimbusml/feature_extraction/image/resizer.py
@@ -20,16 +20,16 @@
class Resizer(core, BaseTransform, TransformerMixin):
"""
- Resizers an image to a specified dimension using a specified
+ Resizes an image to a specified dimension using a specified
resizing method.
.. remarks::
- ``Resizer`` resizers an image to the specified height and width
+ ``Resizer`` resizes an image to the specified height and width
using a specified resizing method. The input variables to this
transform must
be images, typically the result of the ``Loader`` transform.
- :param columns: a dictionary of key-value pairs, where key is the output
+ :param columns: A dictionary of key-value pairs, where key is the output
column name and value is the input column name.
* Multiple key-value pairs are allowed.
diff --git a/src/python/nimbusml/feature_extraction/text/__init__.py b/src/python/nimbusml/feature_extraction/text/__init__.py
index 7dbd24cf..9c16726e 100644
--- a/src/python/nimbusml/feature_extraction/text/__init__.py
+++ b/src/python/nimbusml/feature_extraction/text/__init__.py
@@ -1,10 +1,12 @@
from .lightlda import LightLda
+from .ngramextractor import NGramExtractor
from .ngramfeaturizer import NGramFeaturizer
from .sentiment import Sentiment
from .wordembedding import WordEmbedding
__all__ = [
'LightLda',
+ 'NGramExtractor',
'NGramFeaturizer',
'Sentiment',
'WordEmbedding'
diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py
index 9ec1858f..6da8cfd2 100644
--- a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py
+++ b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py
@@ -18,7 +18,7 @@
class Ngram(core):
"""
- Extracts NGrams from text and convert them to vector using
+ Extracts NGrams from text and converts them to vector using
dictionary.
.. remarks::
diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py
index 2f373a31..fca66615 100644
--- a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py
+++ b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py
@@ -18,7 +18,7 @@
class NgramHash(core):
"""
- Extracts NGrams from text and convert them to vector using hashing
+ Extracts NGrams from text and converts them to vector using hashing
trick.
.. remarks::
diff --git a/src/python/nimbusml/feature_extraction/text/lightlda.py b/src/python/nimbusml/feature_extraction/text/lightlda.py
index 271f90c7..f8801caa 100644
--- a/src/python/nimbusml/feature_extraction/text/lightlda.py
+++ b/src/python/nimbusml/feature_extraction/text/lightlda.py
@@ -30,7 +30,7 @@ class LightLda(core, BaseTransform, TransformerMixin):
topical vectors. LightLDA is an extremely
efficient implementation of LDA developed in MSR-Asia that
incorporates a number of optimization techniques
- `(http://arxiv.org/abs/1412.1576) `_.
+ `(https://arxiv.org/abs/1412.1576) `_.
With the LDA transform, we can
train a topic model to produce 1 million topics with 1 million
vocabulary on a 1-billion-token document set one
diff --git a/src/python/nimbusml/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/feature_extraction/text/ngramextractor.py
new file mode 100644
index 00000000..f27b7004
--- /dev/null
+++ b/src/python/nimbusml/feature_extraction/text/ngramextractor.py
@@ -0,0 +1,72 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+NGramExtractor
+"""
+
+__all__ = ["NGramExtractor"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.feature_extraction.text.ngramextractor import \
+ NGramExtractor as core
+from ...internal.utils.utils import trace
+
+
+class NGramExtractor(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
+
+ :param columns: see `Columns `_.
+
+ :param ngram_length: Maximum n-gram length.
+
+ :param all_lengths: Whether to store all n-gram lengths up to ngramLength,
+ or only ngramLength.
+
+ :param skip_length: Maximum number of tokens to skip when constructing an
+ n-gram.
+
+ :param max_num_terms: Maximum number of n-grams to store in the dictionary.
+
+ :param weighting: The weighting criteria.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ ngram_length=2,
+ all_lengths=True,
+ skip_length=0,
+ max_num_terms=[10000000],
+ weighting='Tf',
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ ngram_length=ngram_length,
+ all_lengths=all_lengths,
+ skip_length=skip_length,
+ max_num_terms=max_num_terms,
+ weighting=weighting,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/feature_extraction/text/wordembedding.py b/src/python/nimbusml/feature_extraction/text/wordembedding.py
index ad467ce1..957cf06d 100644
--- a/src/python/nimbusml/feature_extraction/text/wordembedding.py
+++ b/src/python/nimbusml/feature_extraction/text/wordembedding.py
@@ -31,7 +31,7 @@ class WordEmbedding(core, BaseTransform, TransformerMixin):
available options are various versions of `GloVe Models
`_, `FastText
`_, and `Sswe
- `_.
+ `_.
:param columns: a dictionary of key-value pairs, where key is the output
diff --git a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py
index c54f353b..bdc0a7d2 100644
--- a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py
@@ -42,7 +42,7 @@ class FactorizationMachineBinaryClassifier(
`Field Aware Factorization Machines
`_,
`Field-aware Factorization Machines for CTR Prediction
- `_,
+ `_,
`Adaptive Subgradient Methods for Online Learning and Stochastic
Optimization
`_
diff --git a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py
index 728a7132..9fe01d4f 100644
--- a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py
+++ b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py
@@ -57,13 +57,12 @@ class PcaAnomalyDetector(
`Randomized Methods for Computing the Singular Value Decomposition
(SVD) of very large matrices
- `_
+ `_
`A randomized algorithm for principal component analysis
`_,
`Finding Structure with Randomness: Probabilistic Algorithms for
Constructing Approximate Matrix Decompositions
- `_
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/nimbusml/internal/core/ensemble/booster/dart.py b/src/python/nimbusml/internal/core/ensemble/booster/dart.py
index dd4418d3..49297929 100644
--- a/src/python/nimbusml/internal/core/ensemble/booster/dart.py
+++ b/src/python/nimbusml/internal/core/ensemble/booster/dart.py
@@ -25,9 +25,9 @@ class Dart(Component):
`_ is an
ensemble method of boosted regression trees. The Dropouts meet
Multiple Additive Regression
- Trees (DART) employs dropouts in MART and overcomes the issues of over-
+ Trees (DART) employs dropouts in MART and overcomes the issues of over-
specialization of MART,
- achiving better performance in many tasks.
+ achieving better performance in many tasks.
**Reference**
diff --git a/src/python/nimbusml/internal/core/ensemble/booster/goss.py b/src/python/nimbusml/internal/core/ensemble/booster/goss.py
index 694cb8bf..aa552afc 100644
--- a/src/python/nimbusml/internal/core/ensemble/booster/goss.py
+++ b/src/python/nimbusml/internal/core/ensemble/booster/goss.py
@@ -23,9 +23,9 @@ class Goss(Component):
.. remarks::
Gradient-based One-Side Sampling (GOSS) employs an adaptive sampling
named gradient-based
- sampling. For datasets with large sample size, GOSS has considerable
+ sampling. For datasets with large sample size, GOSS has considerable
advantage in terms of
- statistical and computational efficiency.
+ statistical and computational efficiency.
diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py
index 270584a3..715c2035 100644
--- a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py
@@ -54,7 +54,7 @@ class FastForestBinaryClassifier(
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py
index 74698a6d..37278659 100644
--- a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py
+++ b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py
@@ -64,7 +64,7 @@ class FastForestRegressor(
**Reference**
`Wikipedia: Random forest
- `_
+ `_
`Quantile regression forest
`_
diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py
index 37e5cd76..eef52d67 100644
--- a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py
@@ -78,7 +78,7 @@ class FastTreesBinaryClassifier(
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param number_of_trees: Specifies the total number of decision trees to
create in the ensemble. By creating more decision trees, you can
diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py
index 3ee724c4..25becac7 100644
--- a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py
+++ b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py
@@ -83,7 +83,7 @@ class FastTreesRegressor(
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param number_of_trees: Specifies the total number of decision trees to
create in the ensemble. By creating more decision trees, you can
diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py
index f9340f5d..75a15169 100644
--- a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py
+++ b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py
@@ -35,7 +35,7 @@ class FastTreesTweedieRegressor(
`_
`Greedy function approximation: A gradient boosting machine.
- `_
+ `_
:param number_of_trees: Specifies the total number of decision trees to
create in the ensemble. By creating more decision trees, you can
diff --git a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py
index 56d90d7e..52f2f565 100644
--- a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py
@@ -42,7 +42,7 @@ class GamBinaryClassifier(
functions learned will step between the discretization boundaries.
This implementation is based on the this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -78,7 +78,7 @@ class GamBinaryClassifier(
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param number_of_iterations: Total number of iterations over all features.
diff --git a/src/python/nimbusml/internal/core/ensemble/gamregressor.py b/src/python/nimbusml/internal/core/ensemble/gamregressor.py
index 048bf874..de884d9a 100644
--- a/src/python/nimbusml/internal/core/ensemble/gamregressor.py
+++ b/src/python/nimbusml/internal/core/ensemble/gamregressor.py
@@ -40,7 +40,7 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles):
functions learned will step between the discretization boundaries.
This implementation is based on the this `paper
- `_,
+ `_,
but diverges from it in several important respects: most
significantly,
in each round of boosting, rather than do one feature at a time, it
@@ -76,7 +76,7 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles):
`Generalized additive models
`_,
`Intelligible Models for Classification and Regression
- `_
+ `_
:param number_of_iterations: Total number of iterations over all features.
diff --git a/src/python/nimbusml/internal/core/feature_extraction/image/loader.py b/src/python/nimbusml/internal/core/feature_extraction/image/loader.py
index ad8c70c1..888afab4 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/image/loader.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/image/loader.py
@@ -18,7 +18,7 @@
class Loader(BasePipelineItem, DefaultSignature):
"""
- Loaders image data.
+ Loads image data.
.. remarks::
``Loader`` loads images from paths.
diff --git a/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py b/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py
index 34ba1f39..819fb51c 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/image/resizer.py
@@ -18,11 +18,11 @@
class Resizer(BasePipelineItem, DefaultSignature):
"""
- Resizers an image to a specified dimension using a specified
+ Resizes an image to a specified dimension using a specified
resizing method.
.. remarks::
- ``Resizer`` resizers an image to the specified height and width
+ ``Resizer`` resizes an image to the specified height and width
using a specified resizing method. The input variables to this
transform must
be images, typically the result of the ``Loader`` transform.
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py
index 07fde941..a7292f9c 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py
@@ -18,7 +18,7 @@
class Ngram(Component):
"""
- Extracts NGrams from text and convert them to vector using
+ Extracts NGrams from text and converts them to vector using
dictionary.
.. remarks::
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py
index cd08b4be..04cb7713 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py
@@ -18,7 +18,7 @@
class NgramHash(Component):
"""
- Extracts NGrams from text and convert them to vector using hashing
+ Extracts NGrams from text and converts them to vector using hashing
trick.
.. remarks::
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py
index 45743c1b..8fbcc6e5 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py
@@ -28,7 +28,7 @@ class LightLda(BasePipelineItem, DefaultSignature):
topical vectors. LightLDA is an extremely
efficient implementation of LDA developed in MSR-Asia that
incorporates a number of optimization techniques
- `(http://arxiv.org/abs/1412.1576) `_.
+ `(https://arxiv.org/abs/1412.1576) `_.
With the LDA transform, we can
train a topic model to produce 1 million topics with 1 million
vocabulary on a 1-billion-token document set one
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py
new file mode 100644
index 00000000..c627addd
--- /dev/null
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py
@@ -0,0 +1,111 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+NGramExtractor
+"""
+
+__all__ = ["NGramExtractor"]
+
+
+from ....entrypoints.transforms_ngramtranslator import \
+ transforms_ngramtranslator
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class NGramExtractor(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
+
+ :param ngram_length: Maximum n-gram length.
+
+ :param all_lengths: Whether to store all n-gram lengths up to ngramLength,
+ or only ngramLength.
+
+ :param skip_length: Maximum number of tokens to skip when constructing an
+ n-gram.
+
+ :param max_num_terms: Maximum number of n-grams to store in the dictionary.
+
+ :param weighting: The weighting criteria.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ ngram_length=2,
+ all_lengths=True,
+ skip_length=0,
+ max_num_terms=[10000000],
+ weighting='Tf',
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.ngram_length = ngram_length
+ self.all_lengths = all_lengths
+ self.skip_length = skip_length
+ self.max_num_terms = max_num_terms
+ self.weighting = weighting
+
+ @property
+ def _entrypoint(self):
+ return transforms_ngramtranslator
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i,
+ Name=o) for i,
+ o in zip(
+ input_columns,
+ output_columns)] if input_columns else None,
+ ngram_length=self.ngram_length,
+ all_lengths=self.all_lengths,
+ skip_length=self.skip_length,
+ max_num_terms=self.max_num_terms,
+ weighting=self.weighting)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
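`_get_node` above pairs each input column with its output column and hands the pairs to the entrypoint as `Source`/`Name` dictionaries. Illustratively, the `columns={'Ngrams': 'SentimentText_Transform'}` mapping from the NGramExtractor example earlier in this change would arrive here as:

```python
# Illustration of the `column` argument built above (names taken from
# the NGramExtractor example earlier in this change).
column = [dict(Source='SentimentText_Transform', Name='Ngrams')]
```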
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py
index d67df9db..45553249 100644
--- a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py
+++ b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py
@@ -28,7 +28,7 @@ class WordEmbedding(BasePipelineItem, DefaultSignature):
available options are various versions of `GloVe Models
`_, `FastText
`_, and `Sswe
- `_.
+ `_.
:param model_kind: Pre-trained model used to create the vocabulary.
diff --git a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py
index 26471467..67f10cfc 100644
--- a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py
@@ -67,10 +67,10 @@ class AveragedPerceptronBinaryClassifier(
`_
`Large Margin Classification Using the Perceptron Algorithm
- `_
+ `_
`Discriminative Training Methods for Hidden Markov Models
- `_
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py
index 10c5c2a5..f8346814 100644
--- a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py
@@ -23,7 +23,7 @@ class FastLinearBinaryClassifier(
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear binary classification.
.. remarks::
``FastLinearBinaryClassifier`` is a trainer based on the Stochastic
@@ -80,8 +80,7 @@ class FastLinearBinaryClassifier(
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param l2_regularization: L2 regularizer constant. By default the l2
diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py
index a2880b79..4afcba87 100644
--- a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py
@@ -22,7 +22,8 @@ class FastLinearClassifier(
DefaultSignatureWithRoles):
"""
- Train an SDCA multi class model
+ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for
+ multiclass classification.
.. remarks::
``FastLinearClassifier`` is a trainer based on the Stochastic Dual
@@ -78,8 +79,7 @@ class FastLinearClassifier(
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param l2_regularization: L2 regularizer constant. By default the l2
diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py
index cf9073e5..597e3dfb 100644
--- a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py
+++ b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py
@@ -23,7 +23,7 @@ class FastLinearRegressor(
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear regression.
.. remarks::
``FastLinearRegressor`` is a trainer based on the Stochastic Dual
@@ -78,8 +78,7 @@ class FastLinearRegressor(
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param l2_regularization: L2 regularizer constant. By default the l2
diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py
index 098c92e9..50b344ac 100644
--- a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py
@@ -90,14 +90,14 @@ class LogisticRegressionBinaryClassifier(
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py
index 90af2ffb..3fd6efba 100644
--- a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py
@@ -91,14 +91,14 @@ class LogisticRegressionClassifier(
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py
index b0c5e898..aada6337 100644
--- a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py
@@ -35,14 +35,14 @@ class SgdBinaryClassifier(
associated optimization problem is sparse, then Hogwild SGD achieves
a
nearly optimal rate of convergence. For a detailed reference, please
- refer to `http://arxiv.org/pdf/1106.5730v2.pdf
- `_.
+ refer to `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_.
**Reference**
- `http://arxiv.org/pdf/1106.5730v2.pdf
- `_
+ `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_
:param normalize: Specifies the type of automatic normalization used:
diff --git a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
index db2c39ef..a00c3dc6 100644
--- a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
+++ b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
@@ -23,8 +23,6 @@ class DateTimeSplitter(BasePipelineItem, DefaultSignature):
:param prefix: Output column prefix.
- :param columns_to_drop: Columns to drop after the DateTime Expansion.
-
:param country: Country to get holidays for. Defaults to none if not
passed.
@@ -36,14 +34,12 @@ class DateTimeSplitter(BasePipelineItem, DefaultSignature):
def __init__(
self,
prefix,
- columns_to_drop=None,
country='None',
**params):
BasePipelineItem.__init__(
self, type='transform', **params)
self.prefix = prefix
- self.columns_to_drop = columns_to_drop
self.country = country
@property
@@ -55,7 +51,6 @@ def _get_node(self, **all_args):
algo_args = dict(
source=self.source,
prefix=self.prefix,
- columns_to_drop=self.columns_to_drop,
country=self.country)
all_args.update(algo_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/fromkey.py b/src/python/nimbusml/internal/core/preprocessing/fromkey.py
index bd5cfe10..ef7f8efb 100644
--- a/src/python/nimbusml/internal/core/preprocessing/fromkey.py
+++ b/src/python/nimbusml/internal/core/preprocessing/fromkey.py
@@ -19,8 +19,7 @@
class FromKey(BasePipelineItem, DefaultSignature):
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts the key types back to their original values.
.. remarks::
The ``FromKey`` transform converts a column of keys, generated using
diff --git a/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py b/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py
index 55f0ed01..3fd199aa 100644
--- a/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py
+++ b/src/python/nimbusml/internal/core/preprocessing/missing_values/handler.py
@@ -33,14 +33,13 @@ class Handler(BasePipelineItem, DefaultSignature):
were imputed. This works for columns that have numeric type.
:param replace_with: The method to use to replace NaN values. The
- following choices are available.
-
- * Def: Replace with default value of that type, usually ``0``. If no
- replace
- method is specified, this is the default strategy.
- * Mean: Replace NaN values with the mean of the values in that column.
- * Min: Replace with minimum value in the column.
- * Max: Replace with maximum value in the column.
+ following choices are available.
+
+ * Def: Replace with default value of that type, usually ``0``. If no
+ replace method is specified, this is the default strategy.
+ * Mean: Replace NaN values with the mean of the values in that column.
+ * Min: Replace with minimum value in the column.
+ * Max: Replace with maximum value in the column.
:param impute_by_slot: Whether to impute values by slot.
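For reference, a minimal sketch of the `replace_with` options documented above, using the public `Handler` wrapper; the column names are illustrative assumptions:

```python
# Sketch: impute NaN values with the column mean ('Def', 'Min' and
# 'Max' are the other strategies listed in the docstring above).
from nimbusml.preprocessing.missing_values import Handler

h = Handler(replace_with='Mean', columns={'age_filled': 'age'})
```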
diff --git a/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py b/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py
index d202e947..003e909f 100644
--- a/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py
+++ b/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
PrefixColumnConcatenator
"""
@@ -15,10 +16,12 @@
from ...base_pipeline_item import BasePipelineItem, DefaultSignature
-class PrefixColumnConcatenator(BasePipelineItem, DefaultSignature):
+class PrefixColumnConcatenator(
+ BasePipelineItem,
+ DefaultSignature):
"""
- Combines several columns into a single vector-valued column by prefix
+ Combines several columns into a single vector-valued column by prefix.
.. remarks::
``PrefixColumnConcatenator`` creates a single vector-valued column from
diff --git a/src/python/nimbusml/internal/core/preprocessing/tokey.py b/src/python/nimbusml/internal/core/preprocessing/tokey.py
index 55cd7200..b1295adf 100644
--- a/src/python/nimbusml/internal/core/preprocessing/tokey.py
+++ b/src/python/nimbusml/internal/core/preprocessing/tokey.py
@@ -19,8 +19,7 @@
class ToKey(BasePipelineItem, DefaultSignature):
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts input values (words, numbers, etc.) to indices in a dictionary.
.. remarks::
The ``ToKey`` transform converts a column of text to key values
diff --git a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
index da39b2c8..f1ee5f6b 100644
--- a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
+++ b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
@@ -30,7 +30,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature):
input time-series where each component in the spectrum corresponds to a
trend, seasonal or noise component in the time-series. For details of the
Singular Spectrum Analysis (SSA), refer to `this document
- `_.
+ `_.
:param window_size: The length of the window on the series for building the
trajectory matrix (parameter L).
@@ -38,7 +38,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature):
:param series_length: The length of series that is kept in buffer for
modeling (parameter N).
- :param train_size: The length of series from the begining used for
+ :param train_size: The length of series from the beginning used for
training.
:param horizon: The number of values to forecast.
diff --git a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py b/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py
deleted file mode 100644
index a5c34acb..00000000
--- a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_parquetpathparser.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# - Generated by tools/entrypoint_compiler.py: do not edit by hand
-"""
-ParquetPathParser
-"""
-
-
-from ..utils.entrypoints import Component
-
-
-def parquet_path_parser(
- **params):
- """
- **Description**
- Extract name/value pairs from Parquet formatted directory names.
- Example path: Year=2018/Month=12/data1.parquet
-
- """
-
- entrypoint_name = 'ParquetPathParser'
- settings = {}
-
- component = Component(
- name=entrypoint_name,
- settings=settings,
- kind='PartitionedPathParser')
- return component
diff --git a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py b/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py
deleted file mode 100644
index 3f63ac19..00000000
--- a/src/python/nimbusml/internal/entrypoints/_partitionedpathparser_simplepathparser.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# - Generated by tools/entrypoint_compiler.py: do not edit by hand
-"""
-SimplePathParser
-"""
-
-
-from ..utils.entrypoints import Component
-from ..utils.utils import try_set
-
-
-def simple_path_parser(
- columns=None,
- type='TX',
- **params):
- """
- **Description**
- A simple parser that extracts directory names as column values.
- Column names are defined as arguments.
-
- :param columns: Column definitions used to override the
- Partitioned Path Parser. Expected with the format
- name:type:numeric-source, for example, col=MyFeature:R4:1
- (settings).
- :param type: Data type of each column. (settings).
- """
-
- entrypoint_name = 'SimplePathParser'
- settings = {}
-
- if columns is not None:
- settings['Columns'] = try_set(
- obj=columns,
- none_acceptable=True,
- is_of_type=list,
- is_column=True)
- if type is not None:
- settings['Type'] = try_set(
- obj=type,
- none_acceptable=True,
- is_of_type=str,
- values=[
- 'I1',
- 'U1',
- 'I2',
- 'U2',
- 'I4',
- 'U4',
- 'I8',
- 'U8',
- 'R4',
- 'Num',
- 'R8',
- 'TX',
- 'Text',
- 'TXT',
- 'BL',
- 'Bool',
- 'TimeSpan',
- 'TS',
- 'DT',
- 'DateTime',
- 'DZ',
- 'DateTimeZone',
- 'UG',
- 'U16'])
-
- component = Component(
- name=entrypoint_name,
- settings=settings,
- kind='PartitionedPathParser')
- return component
diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py
deleted file mode 100644
index 3c080eb6..00000000
--- a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# - Generated by tools/entrypoint_compiler.py: do not edit by hand
-"""
-Models.OnnxConverter
-"""
-
-
-from ..utils.entrypoints import EntryPoint
-from ..utils.utils import try_set, unlist
-
-
-def models_onnxconverter(
- onnx,
- data_file=None,
- json=None,
- name=None,
- domain=None,
- inputs_to_drop=None,
- outputs_to_drop=None,
- model=None,
- onnx_version='Stable',
- predictive_model=None,
- **params):
- """
- **Description**
- Converts the model to ONNX format.
-
- :param data_file: The data file (inputs).
- :param onnx: The path to write the output ONNX to. (inputs).
- :param json: The path to write the output JSON to. (inputs).
- :param name: The 'name' property in the output ONNX. By default
- this will be the ONNX extension-less name. (inputs).
- :param domain: The 'domain' property in the output ONNX.
- (inputs).
- :param inputs_to_drop: Array of input column names to drop
- (inputs).
- :param outputs_to_drop: Array of output column names to drop
- (inputs).
- :param model: Model that needs to be converted to ONNX format.
- (inputs).
- :param onnx_version: The targeted ONNX version. It can be either
- "Stable" or "Experimental". If "Experimental" is used,
- produced model can contain components that is not officially
- supported in ONNX standard. (inputs).
- :param predictive_model: Predictor model that needs to be
- converted to ONNX format. (inputs).
- """
-
- entrypoint_name = 'Models.OnnxConverter'
- inputs = {}
- outputs = {}
-
- if data_file is not None:
- inputs['DataFile'] = try_set(
- obj=data_file,
- none_acceptable=True,
- is_of_type=str)
- if onnx is not None:
- inputs['Onnx'] = try_set(
- obj=onnx,
- none_acceptable=False,
- is_of_type=str)
- if json is not None:
- inputs['Json'] = try_set(
- obj=json,
- none_acceptable=True,
- is_of_type=str)
- if name is not None:
- inputs['Name'] = try_set(
- obj=name,
- none_acceptable=True,
- is_of_type=str,
- is_column=True)
- if domain is not None:
- inputs['Domain'] = try_set(
- obj=domain,
- none_acceptable=True,
- is_of_type=str)
- if inputs_to_drop is not None:
- inputs['InputsToDrop'] = try_set(
- obj=inputs_to_drop,
- none_acceptable=True,
- is_of_type=list)
- if outputs_to_drop is not None:
- inputs['OutputsToDrop'] = try_set(
- obj=outputs_to_drop,
- none_acceptable=True,
- is_of_type=list)
- if model is not None:
- inputs['Model'] = try_set(
- obj=model,
- none_acceptable=True,
- is_of_type=str)
- if onnx_version is not None:
- inputs['OnnxVersion'] = try_set(
- obj=onnx_version,
- none_acceptable=True,
- is_of_type=str,
- values=[
- 'Stable',
- 'Experimental'])
- if predictive_model is not None:
- inputs['PredictiveModel'] = try_set(
- obj=predictive_model, none_acceptable=True, is_of_type=str)
-
- input_variables = {
- x for x in unlist(inputs.values())
- if isinstance(x, str) and x.startswith("$")}
- output_variables = {
- x for x in unlist(outputs.values())
- if isinstance(x, str) and x.startswith("$")}
-
- entrypoint = EntryPoint(
- name=entrypoint_name, inputs=inputs, outputs=outputs,
- input_variables=input_variables,
- output_variables=output_variables)
- return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/models_schema.py b/src/python/nimbusml/internal/entrypoints/models_schema.py
index 0b8b0056..096aa2e5 100644
--- a/src/python/nimbusml/internal/entrypoints/models_schema.py
+++ b/src/python/nimbusml/internal/entrypoints/models_schema.py
@@ -1,5 +1,6 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
-Models.Summarizer
+Models.Schema
"""
@@ -8,23 +9,24 @@
def models_schema(
- transform_model,
+ model,
schema=None,
**params):
"""
**Description**
- Retreives input/output column schema for transform model.
+ Retrieves the output model schema.
- :param transform_model: The transform model.
+ :param model: The transform model. (inputs).
+ :param schema: The model schema (outputs).
"""
entrypoint_name = 'Models.Schema'
inputs = {}
outputs = {}
- if transform_model is not None:
+ if model is not None:
inputs['Model'] = try_set(
- obj=transform_model,
+ obj=model,
none_acceptable=False,
is_of_type=str)
if schema is not None:
@@ -32,7 +34,7 @@ def models_schema(
obj=schema,
none_acceptable=False,
is_of_type=str)
-
+
input_variables = {
x for x in unlist(inputs.values())
if isinstance(x, str) and x.startswith("$")}
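With the regeneration, the entrypoint's input is named `model` (previously `transform_model`) and `schema` is declared as a required output. A hedged sketch of constructing the node; the `$`-prefixed names are hypothetical graph-variable placeholders, not values from this change:

```python
# Sketch: build a Models.Schema node; '$input_model' and '$schema'
# are illustrative variable names for the entrypoint graph.
node = models_schema(model='$input_model', schema='$schema')
```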
diff --git a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
index f02da3a7..1684783c 100644
--- a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
+++ b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
@@ -43,7 +43,7 @@ def timeseriesprocessingentrypoints_ssaforecasting(
building the trajectory matrix (parameter L). (inputs).
:param series_length: The length of series that is kept in buffer
for modeling (parameter N). (inputs).
- :param train_size: The length of series from the begining used
+ :param train_size: The length of series from the beginning used
for training. (inputs).
:param horizon: The number of values to forecast. (inputs).
:param confidence_level: The confidence level in [0, 1) for
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
index e5b62a23..5c281338 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
@@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelbinaryclassifier(
**Description**
Trains a gradient boosted stump per feature, on all features
simultaneously, to fit target values using least-squares. It
- mantains no interactions between features.
+ maintains no interactions between features.
:param number_of_iterations: Total number of iterations over all
features (inputs).
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
index 1c56a706..2b9334f8 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
@@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelregressor(
**Description**
Trains a gradient boosted stump per feature, on all features
simultaneously, to fit target values using least-squares. It
- mantains no interactions between features.
+ maintains no interactions between features.
:param number_of_iterations: Total number of iterations over all
features (inputs).
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
index 5db498b1..61759e4d 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
@@ -33,7 +33,7 @@ def trainers_logisticregressionclassifier(
**params):
"""
**Description**
- Maximum entrypy classification is a method in statistics used to
+ Maximum entropy classification is a method in statistics used to
predict the probabilities of parallel events. The model
predicts the probabilities of parallel events by fitting data
to a softmax function.
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
index 7a5d8c71..addc2298 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
@@ -1,3 +1,4 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.DatasetScorerEx
"""
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
index 7afc028a..ac2524c8 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
@@ -14,7 +14,6 @@ def transforms_datetimesplitter(
prefix,
output_data=None,
model=None,
- columns_to_drop=None,
country='None',
**params):
"""
@@ -24,8 +23,6 @@ def transforms_datetimesplitter(
:param source: Input column (inputs).
:param data: Input dataset (inputs).
:param prefix: Output column prefix (inputs).
- :param columns_to_drop: Columns to drop after the DateTime
- Expansion (inputs).
:param country: Country to get holidays for. Defaults to none if
not passed (inputs).
:param output_data: Transformed dataset (outputs).
@@ -52,12 +49,6 @@ def transforms_datetimesplitter(
obj=prefix,
none_acceptable=False,
is_of_type=str)
- if columns_to_drop is not None:
- inputs['ColumnsToDrop'] = try_set(
- obj=columns_to_drop,
- none_acceptable=True,
- is_of_type=list,
- is_column=True)
if country is not None:
inputs['Country'] = try_set(
obj=country,
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
index 1f1a3870..121115b4 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
@@ -21,7 +21,7 @@ def transforms_missingvaluehandler(
**Description**
Handle missing values by replacing them with either the default value
or the mean/min/max value (for non-text columns only). An
- indicator column can optionally be concatenated, if theinput
+ indicator column can optionally be concatenated, if the input
column type is numeric.
:param column: New column definition(s) (optional form:
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py
index cfe672b7..301f1c2f 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py
@@ -1,3 +1,4 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.PrefixColumnConcatenator
"""
@@ -15,10 +16,10 @@ def transforms_prefixcolumnconcatenator(
**params):
"""
**Description**
- Concatenates one or more columns of the same item type by prefix.
+ Concatenates one or more columns of the same item type.
- :param column: New column definition(s) (optional form:
- name:srcs) (inputs).
+ :param column: New column definition(s) (optional form: name:src)
+ (inputs).
:param data: Input dataset (inputs).
:param output_data: Transformed dataset (outputs).
:param model: Transform model (outputs).
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
index e19bd1f1..499a08e3 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
@@ -83,8 +83,7 @@ def transforms_timeseriesimputer(
values=[
'ForwardFill',
'BackFill',
- 'Median',
- 'Interpolate'])
+ 'Median'])
if supress_type_errors is not None:
inputs['SupressTypeErrors'] = try_set(
obj=supress_type_errors,
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
similarity index 82%
rename from src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py
rename to src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
index 16fca0ad..febcffde 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumntransform.py
@@ -1,3 +1,4 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.VariableColumnTransform
"""
@@ -7,7 +8,7 @@
from ..utils.utils import try_set, unlist
-def transforms_variablecolumn(
+def transforms_variablecolumntransform(
data,
output_data=None,
model=None,
@@ -16,10 +17,12 @@ def transforms_variablecolumn(
**params):
"""
**Description**
- Combines the specified input columns in to a
- single variable length vectorized column.
+        Combines the specified input columns into a single variable-length
+        vectorized column.
:param data: Input dataset (inputs).
+ :param features: Features (inputs).
+ :param length_column_name: Length Column Name (inputs).
:param output_data: Transformed dataset (outputs).
:param model: Transform model (outputs).
"""
@@ -43,7 +46,8 @@ def transforms_variablecolumn(
inputs['LengthColumnName'] = try_set(
obj=length_column_name,
none_acceptable=True,
- is_of_type=str)
+ is_of_type=str,
+ is_column=True)
if output_data is not None:
outputs['OutputData'] = try_set(
obj=output_data,
diff --git a/src/python/nimbusml/internal/utils/entrypoints.py b/src/python/nimbusml/internal/utils/entrypoints.py
index 0e06ff15..a907e52a 100644
--- a/src/python/nimbusml/internal/utils/entrypoints.py
+++ b/src/python/nimbusml/internal/utils/entrypoints.py
@@ -320,8 +320,8 @@ def _get_separator(self):
return pieces[0].replace("sep=", "").strip()
def run(self, X, y=None, max_slots=-1, random_state=None, verbose=1, **params):
- if params.get("dryrun") is not None:
- return 'graph = %s' % (str(self))
+ if params.get("dry_run", False):
+ return str(self)
output_modelfilename = None
output_predictor_modelfilename = None
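With this change, passing `dry_run=True` makes `run` return the serialized entrypoint graph itself (a JSON string) rather than the old `'graph = ...'`-prefixed form. A minimal sketch of the new behavior, assuming an already-built `Graph` instance as in the updated `test_entrypoints.py` below; the top-level `'nodes'` key matches what the new CV test parses:

```python
import json

# Nothing is executed when dry_run=True; the JSON graph comes back instead.
graph_json = graph.run(X=None, dry_run=True)
node_names = [node['Name'] for node in json.loads(graph_json)['nodes']]
```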
diff --git a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
index 0b467a37..3825c9e0 100644
--- a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
+++ b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py
@@ -67,10 +67,10 @@ class AveragedPerceptronBinaryClassifier(
`_
`Large Margin Classification Using the Perceptron Algorithm
- `_
+ `_
`Discriminative Training Methods for Hidden Markov Models
- `_
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py
index 4758454b..3f0fd7c7 100644
--- a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py
+++ b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py
@@ -23,7 +23,7 @@ class FastLinearBinaryClassifier(
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear binary classification.
.. remarks::
``FastLinearBinaryClassifier`` is a trainer based on the Stochastic
@@ -80,8 +80,7 @@ class FastLinearBinaryClassifier(
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/linear_model/fastlinearclassifier.py b/src/python/nimbusml/linear_model/fastlinearclassifier.py
index d1ef7644..50162961 100644
--- a/src/python/nimbusml/linear_model/fastlinearclassifier.py
+++ b/src/python/nimbusml/linear_model/fastlinearclassifier.py
@@ -21,7 +21,8 @@
class FastLinearClassifier(core, BasePredictor, ClassifierMixin):
"""
- Train an SDCA multi class model
+ A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer for
+ multi class classification.
.. remarks::
``FastLinearClassifier`` is a trainer based on the Stochastic Dual
@@ -77,8 +78,7 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin):
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/linear_model/fastlinearregressor.py b/src/python/nimbusml/linear_model/fastlinearregressor.py
index 766a79ae..73745f22 100644
--- a/src/python/nimbusml/linear_model/fastlinearregressor.py
+++ b/src/python/nimbusml/linear_model/fastlinearregressor.py
@@ -22,7 +22,7 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin):
"""
A Stochastic Dual Coordinate Ascent (SDCA) optimization trainer
- for linear binary classification and regression.
+ for linear regression.
.. remarks::
``FastLinearRegressor`` is a trainer based on the Stochastic Dual
@@ -77,8 +77,7 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin):
content/uploads/2016/06/main-3.pdf>`_
`Stochastic Dual Coordinate Ascent Methods for Regularized Loss
- Minimization `_
+ Minimization `_
:param feature: see `Columns `_.
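These three docstring fixes disambiguate the SDCA family: one trainer per task rather than a shared blurb. A toy sketch, assuming small in-memory lists are acceptable as they are in the other tests in this patch:

```python
from nimbusml.linear_model import (FastLinearBinaryClassifier,
                                   FastLinearClassifier,
                                   FastLinearRegressor)

# One SDCA trainer per task: binary, multiclass, regression.
FastLinearBinaryClassifier().fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
FastLinearClassifier().fit([[0.0], [1.0], [2.0], [3.0]], [0, 1, 2, 2])
FastLinearRegressor().fit([[0.0], [1.0], [2.0], [3.0]], [1.0, 2.0, 3.0, 4.0])
```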
diff --git a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py
index 1cf29de4..76410659 100644
--- a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py
+++ b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py
@@ -91,14 +91,14 @@ class LogisticRegressionBinaryClassifier(
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/nimbusml/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/linear_model/logisticregressionclassifier.py
index 265adc10..9155799e 100644
--- a/src/python/nimbusml/linear_model/logisticregressionclassifier.py
+++ b/src/python/nimbusml/linear_model/logisticregressionclassifier.py
@@ -92,14 +92,14 @@ class LogisticRegressionClassifier(
**Reference**
- `Wikipedia: L-BFGS `_
+ `Wikipedia: L-BFGS `_
`Wikipedia: Logistic
- regression `_
+ regression `_
`Scalable
Training of L1-Regularized Log-Linear Models
- `_
+ `_
`Test Run - L1
and L2 Regularization for Machine Learning
diff --git a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py
index a5ee573d..893f6465 100644
--- a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py
+++ b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py
@@ -34,14 +34,14 @@ class SgdBinaryClassifier(core, BasePredictor, ClassifierMixin):
associated optimization problem is sparse, then Hogwild SGD achieves
a
nearly optimal rate of convergence. For a detailed reference, please
- refer to `http://arxiv.org/pdf/1106.5730v2.pdf
- `_.
+ refer to `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_.
**Reference**
- `http://arxiv.org/pdf/1106.5730v2.pdf
- `_
+ `https://arxiv.org/pdf/1106.5730v2.pdf
+ `_
:param feature: see `Columns `_.
diff --git a/src/python/nimbusml/model_selection/cv.py b/src/python/nimbusml/model_selection/cv.py
index 79a5def4..effff597 100644
--- a/src/python/nimbusml/model_selection/cv.py
+++ b/src/python/nimbusml/model_selection/cv.py
@@ -96,7 +96,7 @@ class CV:
the average of each metric on all models.
:param pipeline: Pipeline object or a list of pipeline steps that's
- used for cross validation
+ used for cross validation
'''
fold_column_name = 'Fold'
@@ -307,27 +307,22 @@ def _cleanup_results(self, results, cv):
return clean_results
- def _process_split_start(self, split_start):
- nodes = self._pipeline.nodes
- pipeline_len = len(nodes)
+ def _process_split_start(self, split_start, num_transform_nodes):
if isinstance(split_start, str):
if split_start == 'before_transforms':
split_index = 0
elif split_start == 'after_transforms':
- split_index = pipeline_len - 1
+ split_index = num_transform_nodes
else:
raise ValueError(
'String value for split_start should be either '
'"before_transforms" or "after_transforms"')
if isinstance(split_start, six.integer_types):
- try:
- nodes[split_start]
- except IndexError:
+ if split_start > num_transform_nodes:
raise ValueError(
'Pipeline doesn\'t contain a step for split_start={'
- '}'.format(
- split_start))
+ '}'.format(split_start))
split_index = split_start
@@ -335,7 +330,10 @@ def _process_split_start(self, split_start):
# Convert split_index to positive number, so that it can index into
# list of transforms without the learner.
if split_index < 0:
- split_index = split_index + pipeline_len
+ split_index = split_index + num_transform_nodes
+
+ if split_index < 0:
+ raise ValueError('Invalid split index.')
return split_index
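The index arithmetic is now relative to the number of transform nodes instead of the full pipeline length. A standalone sketch of the resulting semantics (a simplified re-implementation for illustration, not the method itself):

```python
def resolve_split_index(split_start, num_transform_nodes):
    # Simplified model of CV._process_split_start after this change.
    if split_start == 'before_transforms':
        split_index = 0                      # all transforms run inside each fold
    elif split_start == 'after_transforms':
        split_index = num_transform_nodes    # all transforms run once, pre-split
    else:
        if split_start > num_transform_nodes:
            raise ValueError('No step for split_start=%r' % split_start)
        split_index = split_start
    if split_index < 0:
        split_index += num_transform_nodes   # negatives count back from the learner
    if split_index < 0:
        raise ValueError('Invalid split index.')
    return split_index

assert resolve_split_index('after_transforms', 2) == 2
assert resolve_split_index(-1, 2) == 1
```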
@@ -426,6 +424,7 @@ def fit(
self._results = None
self._raw_results = None
verbose = 1
+ dry_run = params.pop('dry_run', False)
# _fit_graph() seems to have side-effects on the pipeline object
# Use a clone, so that we can reuse CV object for multiple calls to
@@ -468,9 +467,10 @@ def fit(
'groups in .fit() function.')
- split_index = self._process_split_start(split_start)
graph_sections = cv_aux_info.graph_sections
transforms = graph_sections.get('transform_nodes', [])
+
+ split_index = self._process_split_start(split_start, len(transforms))
pre_split_transforms = transforms[:split_index]
post_split_transforms = transforms[split_index:]
implicit_nodes = graph_sections['implicit_nodes']
@@ -562,11 +562,16 @@ def fit(
telemetry_info=telemetry_info,
is_cv=True,
output_types=self.output_types,
+ dry_run=dry_run,
**params)
except RuntimeError as e:
self._run_time = time.time() - start_time
raise e
- self._raw_results = graph_run_results
- self._results = self._cleanup_results(graph_run_results, cv)
+ if dry_run:
+ self._results = graph_run_results
+ else:
+ self._raw_results = graph_run_results
+ self._results = self._cleanup_results(graph_run_results, cv)
+
return self._results
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index 3e0dce27..4efa9cf0 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -119,10 +119,10 @@ class Pipeline:
for more details on how to select these.
:param steps: the list of operator or (name, operator) tuples that
- are chained in the appropriate order.
+ are chained in the appropriate order.
:param model: the path to the model file (".zip") if want to load a
- model directly from file (such as a trained model from ML.NET).
+ model directly from file (such as a trained model from ML.NET).
:param random_state: the integer used as the random seed.
@@ -1223,6 +1223,19 @@ def fit_transform(
:param X: {array-like [n_samples, n_features],
:py:func:`FileDataStream ` }
:param y: {array-like [n_samples]}
+ :param as_binary_data_stream: If ``True`` then output an IDV file.
+ See `here `_
+ for more information.
+ :param params: Additional arguments.
+ If ``as_csr=True`` and ``as_binary_data_stream=False`` then
+ return the transformed data in CSR (sparse matrix) format.
+ If ``as_binary_data_stream`` is also true then that
+ parameter takes precedence over ``as_csr`` and the output will
+ be an IDV file.
+
+ :return: Returns a pandas DataFrame if no other output format
+ is specified. See ``as_binary_data_stream`` and ``as_csr``
+ for other available output formats.
"""
self.fit(
X,
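A short sketch of the three output formats the new docstring describes (toy data and names are mine):

```python
import pandas as pd
from nimbusml import Pipeline
from nimbusml.feature_extraction.categorical import OneHotVectorizer

data = pd.DataFrame(dict(education=['A', 'B', 'A']))
pipe = Pipeline([OneHotVectorizer() << 'education'])

df = pipe.fit_transform(data)                # default: a pandas DataFrame
# pipe.fit_transform(data, as_csr=True)      # scipy.sparse CSR matrix instead
# pipe.fit_transform(data, as_binary_data_stream=True)  # IDV; takes precedence over as_csr
```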
@@ -1529,10 +1542,14 @@ def _evaluation_infer(self, evaltype, label_column, group_id,
models_anomalydetectionevaluator(**params)])
elif type_ == 'ranking':
- svd = "$scoredVectorData"
column = [OrderedDict(Source=group_id, Name=group_id)]
- algo_args = dict(data=svd, output_data=svd, column=column)
+ algo_args = dict(
+ data="$scoredVectorData",
+ output_data="$scoredVectorData2",
+ column=column)
key_node = transforms_texttokeyconverter(**algo_args)
+
+ params['data'] = "$scoredVectorData2"
evaluate_node = models_rankingevaluator(
group_id_column=group_id, **params)
all_nodes.extend([
@@ -1834,7 +1851,7 @@ def get_output_columns(self, verbose=0, **params):
inputs = dict([('transform_model', self.model)])
schema_node = models_schema(
- transform_model="$transform_model",
+ model="$transform_model",
schema="$output_data")
all_nodes = [schema_node]
@@ -2443,7 +2460,19 @@ def transform(
:param X: {array-like [n_samples, n_features],
:py:class:`nimbusml.FileDataStream` }
:param y: {array-like [n_samples]}
-
+ :param as_binary_data_stream: If ``True`` then output an IDV file.
+ See `here `_
+ for more information.
+ :param params: Additional arguments.
+ If ``as_csr=True`` and ``as_binary_data_stream=False`` then
+ return the transformed data in CSR (sparse matrix) format.
+ If ``as_binary_data_stream`` is also true then that
+ parameter takes precedence over ``as_csr`` and the output will
+ be an IDV file.
+
+ :return: Returns a pandas DataFrame if no other output format
+ is specified. See ``as_binary_data_stream`` and ``as_csr``
+ for other available output formats.
"""
# start the clock!
start_time = time.time()
@@ -2539,7 +2568,7 @@ def summary(self, verbose=0, **params):
if len(self.steps) > 0 and not isinstance(
self.last_node, BasePredictor):
raise ValueError(
- "Summary is availabe only for predictor types, instead "
+ "Summary is available only for predictor types, instead "
"got " +
self.last_node.type)
@@ -2577,6 +2606,10 @@ def summary(self, verbose=0, **params):
self._run_time = time.time() - start_time
raise e
+        # .summary() is not supported when summary_data contains only the
+        # PredictorName column (i.e. summary_data.size == 1)
+ if summary_data.size == 1 and summary_data.columns.values == ["PredictorName"]:
+ raise TypeError("One or more predictors in this pipeline do not support the .summary() function.")
self.model_summary = summary_data
# stop the clock
diff --git a/src/python/nimbusml/preprocessing/datetimesplitter.py b/src/python/nimbusml/preprocessing/datetimesplitter.py
index fb33337b..c3fceb43 100644
--- a/src/python/nimbusml/preprocessing/datetimesplitter.py
+++ b/src/python/nimbusml/preprocessing/datetimesplitter.py
@@ -27,8 +27,6 @@ class DateTimeSplitter(core, BaseTransform, TransformerMixin):
:param prefix: Output column prefix.
- :param columns_to_drop: Columns to drop after the DateTime Expansion.
-
:param country: Country to get holidays for. Defaults to none if not
passed.
@@ -40,7 +38,6 @@ class DateTimeSplitter(core, BaseTransform, TransformerMixin):
def __init__(
self,
prefix,
- columns_to_drop=None,
country='None',
columns=None,
**params):
@@ -51,7 +48,6 @@ def __init__(
core.__init__(
self,
prefix=prefix,
- columns_to_drop=columns_to_drop,
country=country,
**params)
self._columns = columns
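With `columns_to_drop` removed, the replacement pattern is a separate `ColumnSelector` step, which is exactly what the updated `test_datetimesplitter.py` later in this patch does. A condensed sketch:

```python
from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
pipeline = Pipeline([dts, ColumnSelector(drop_columns=['dtHour12', 'dtDayOfWeek'])])
```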
diff --git a/src/python/nimbusml/preprocessing/fromkey.py b/src/python/nimbusml/preprocessing/fromkey.py
index f83d90a7..126d6b5f 100644
--- a/src/python/nimbusml/preprocessing/fromkey.py
+++ b/src/python/nimbusml/preprocessing/fromkey.py
@@ -20,8 +20,7 @@
class FromKey(core, BaseTransform, TransformerMixin):
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts the key types back to their original values.
.. remarks::
The ``FromKey`` transform converts a column of keys, generated using
diff --git a/src/python/nimbusml/preprocessing/missing_values/handler.py b/src/python/nimbusml/preprocessing/missing_values/handler.py
index 1a1fac0a..01da758b 100644
--- a/src/python/nimbusml/preprocessing/missing_values/handler.py
+++ b/src/python/nimbusml/preprocessing/missing_values/handler.py
@@ -54,14 +54,13 @@ class Handler(core, BaseTransform, TransformerMixin):
For more details see `Columns `_.
:param replace_with: The method to use to replace NaN values. The
- following choices are available.
-
- * Def: Replace with default value of that type, usually ``0``. If no
- replace
- method is specified, this is the default strategy.
- * Mean: Replace NaN values with the mean of the values in that column.
- * Min: Replace with minimum value in the column.
- * Max: Replace with maximum value in the column.
+ following choices are available.
+
+ * Def: Replace with default value of that type, usually ``0``. If no
+ replace method is specified, this is the default strategy.
+ * Mean: Replace NaN values with the mean of the values in that column.
+ * Min: Replace with minimum value in the column.
+ * Max: Replace with maximum value in the column.
:param impute_by_slot: Whether to impute values by slot.
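A minimal usage sketch of `replace_with`, mirroring the new CV test elsewhere in this patch:

```python
from nimbusml.preprocessing.missing_values import Handler

# Replace NaNs in both columns with the per-column mean.
imputer = Handler(replace_with='Mean') << {'Ozone': 'Ozone', 'Solar_R': 'Solar_R'}
```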
diff --git a/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py b/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py
index 9a3aa443..6e0662e1 100644
--- a/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py
+++ b/src/python/nimbusml/preprocessing/schema/prefixcolumnconcatenator.py
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
PrefixColumnConcatenator
"""
@@ -17,7 +18,10 @@
from ...internal.utils.utils import trace
-class PrefixColumnConcatenator(core, BaseTransform, TransformerMixin):
+class PrefixColumnConcatenator(
+ core,
+ BaseTransform,
+ TransformerMixin):
"""
Combines several columns into a single vector-valued column by prefix.
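A hypothetical usage sketch; the `columns={output: prefix}` mapping is an assumption by analogy with the other schema transforms, not something this patch shows:

```python
import pandas as pd
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

df = pd.DataFrame(dict(Sepal_Length=[1.0, 2.0], Sepal_Width=[0.5, 0.7]))
# Assumed mapping: every column whose name starts with 'Sepal_' feeds 'Features'.
concat = PrefixColumnConcatenator(columns={'Features': 'Sepal_'})
print(concat.fit_transform(df))
```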
diff --git a/src/python/nimbusml/preprocessing/tokey.py b/src/python/nimbusml/preprocessing/tokey.py
index 97c00ad3..c94c2eac 100644
--- a/src/python/nimbusml/preprocessing/tokey.py
+++ b/src/python/nimbusml/preprocessing/tokey.py
@@ -20,8 +20,7 @@
class ToKey(core, BaseTransform, TransformerMixin):
"""
- Text transforms that can be performed on data before training
- a model.
+ Converts input values (words, numbers, etc.) to index in a dictionary.
.. remarks::
The ``ToKey`` transform converts a column of text to key values
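The two rewritten summaries (here and in `FromKey` above) describe inverse operations; a round-trip sketch with a toy column:

```python
import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing import FromKey, ToKey

df = pd.DataFrame(dict(c=['b', 'a', 'b']))
# ToKey maps each distinct value to a key index; FromKey restores the values.
roundtrip = Pipeline([ToKey() << 'c', FromKey() << 'c'])
print(roundtrip.fit_transform(df))
```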
diff --git a/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py b/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
index 0c31c9ff..16442dff 100644
--- a/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
+++ b/src/python/nimbusml/tests/ensemble/test_lightgbmclassifier.py
@@ -19,8 +19,10 @@
class TestLightGbmClassifier(unittest.TestCase):
- @unittest.skipIf(platform.system() in ("Linux", "Darwin") and six.PY2,
- "encoding/decoding issues with linux py2.7, bug 286536")
+ @unittest.skipIf(platform.system() == "Darwin" and six.PY2,
+ "Disabled due to bug on Mac Python 2.7 build, more info: \
+ https://github.com/microsoft/NimbusML/issues/366, \
+ https://github.com/microsoft/NimbusML/pull/362")
def test_lightgbmclassifier(self):
np.random.seed(0)
train_file = get_dataset('wiki_detox_train').as_filepath()
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py
new file mode 100644
index 00000000..5914a24f
--- /dev/null
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_ngramextractor.py
@@ -0,0 +1,36 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import os
+import unittest
+import pandas
+
+from nimbusml import FileDataStream, Pipeline
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.text import NGramExtractor
+from nimbusml.preprocessing.text import CharTokenizer
+from nimbusml.preprocessing.schema import ColumnDropper
+
+
+class TestNGramExtractor(unittest.TestCase):
+
+    def test_ngramextractor(self):
+ train_df = pandas.DataFrame(data=dict(review=['one', 'two']))
+
+ pipeline = Pipeline([
+ CharTokenizer(columns={'review_transform': 'review'}),
+ NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
+ ColumnDropper(columns=['review_transform', 'review'])
+ ])
+
+ result = pipeline.fit_transform(train_df)
+ self.assertEqual(len(result.columns), 6)
+ self.assertEqual(result.loc[0, 'ngrams.o|n|e'], 1.0)
+ self.assertEqual(result.loc[1, 'ngrams.o|n|e'], 0.0)
+ self.assertEqual(result.loc[0, 'ngrams.t|w|o'], 0.0)
+ self.assertEqual(result.loc[1, 'ngrams.t|w|o'], 1.0)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
index 6b183b91..21ad6c12 100644
--- a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
+++ b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
@@ -3,7 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
-import os
+import platform
import unittest
import numpy as np
@@ -18,8 +18,10 @@
class TestNGramFeaturizer(unittest.TestCase):
- @unittest.skipIf(os.name != "nt" and six.PY2,
- "encoding/decoding issues with linux py2.7, bug 286536")
+ @unittest.skipIf(platform.system() == "Darwin" and six.PY2,
+ "Disabled due to bug on Mac Python 2.7 build, more info: \
+ https://github.com/microsoft/NimbusML/issues/366, \
+ https://github.com/microsoft/NimbusML/pull/362")
def test_ngramfeaturizer(self):
np.random.seed(0)
train_file = get_dataset('wiki_detox_train').as_filepath()
diff --git a/src/python/nimbusml/tests/model_selection/test_cv.py b/src/python/nimbusml/tests/model_selection/test_cv.py
index 2f264de2..b6883331 100644
--- a/src/python/nimbusml/tests/model_selection/test_cv.py
+++ b/src/python/nimbusml/tests/model_selection/test_cv.py
@@ -4,6 +4,7 @@
# --------------------------------------------------------------------------------------------
import os
+import json
import unittest
import numpy as np
@@ -11,13 +12,14 @@
from nimbusml import Pipeline, FileDataStream, Role, DataSchema
from nimbusml.cluster import KMeansPlusPlus
from nimbusml.datasets import get_dataset
-from nimbusml.ensemble import FastForestRegressor, LightGbmRanker
+from nimbusml.ensemble import FastForestRegressor, LightGbmRanker, LightGbmRegressor
from nimbusml.feature_extraction.categorical import OneHotVectorizer, \
OneHotHashVectorizer
from nimbusml.linear_model import FastLinearClassifier, \
LogisticRegressionBinaryClassifier, LogisticRegressionClassifier
from nimbusml.model_selection import CV
from nimbusml.preprocessing import ToKey
+from nimbusml.preprocessing.missing_values import Indicator, Handler
from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper
from nimbusml.tests.test_utils import split_features_and_label
from sklearn.utils.testing import assert_equal, assert_true, \
@@ -123,6 +125,9 @@ def check_cv(
cv = CV(pipeline)
if split_start == 'try_all':
len_pipeline = len(pipeline.nodes)
+ if pipeline.last_node.type != 'transform':
+ len_pipeline = len_pipeline - 1
+
values_to_test = ['after_transforms', 'before_transforms']
values_to_test.extend(list(range(len_pipeline)))
values_to_test.extend(list(range(-len_pipeline, 0)))
@@ -249,6 +254,38 @@ def test_unsupported_split_start(self):
self.check_cv_with_defaults(
split_start=split_start, graph_id=str(split_start))
+ def test_split_start_with_transforms_with_presteps(self):
+ path = get_dataset("airquality").as_filepath()
+ schema = DataSchema.read_schema(path)
+ data = FileDataStream(path, schema)
+
+ pipeline_steps = [
+ Indicator() << {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'},
+ Handler(replace_with='Mean') << {
+ 'Solar_R': 'Solar_R',
+ 'Ozone': 'Ozone'},
+ LightGbmRegressor(
+ feature=['Ozone',
+ 'Solar_R',
+ 'Ozone_ind',
+ 'Solar_R_ind',
+ 'Temp'],
+ label='Wind')]
+
+ results = CV(pipeline_steps).fit(data,
+ split_start='after_transforms',
+ dry_run=True)
+ results = json.loads(results)
+
+ node_names = [ep['Name'] for ep in results['nodes']]
+ cv_node = [ep for ep in results['nodes']
+ if 'Models.CrossValidator' in ep['Name']][0]
+ cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]
+
+ self.assertTrue('Transforms.MissingValueHandler' in node_names)
+ self.assertTrue('Transforms.MissingValueHandler' not in cv_sub_node_names)
+ self.assertTrue('Transforms.ModelCombiner' in node_names)
+
class TestCvBinary(unittest.TestCase):
infert_case_index = 5
@@ -562,3 +599,7 @@ def test_df(self):
y=[0, 1, 2, 10, 11, 12, -10, -11, -12],
z=[0, 1, 2, 10, 11, 12, -10, -11, -12]))
check_cv([KMeansPlusPlus(n_clusters=3)], X)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/model_summary/test_model_summary.py b/src/python/nimbusml/tests/model_summary/test_model_summary.py
index 650238ae..3d0c659c 100644
--- a/src/python/nimbusml/tests/model_summary/test_model_summary.py
+++ b/src/python/nimbusml/tests/model_summary/test_model_summary.py
@@ -66,25 +66,25 @@
#SymSgdBinaryClassifier(),
OrdinaryLeastSquaresRegressor(),
PoissonRegressionRegressor(),
- OneVsRestClassifier(FastLinearBinaryClassifier()),
GamRegressor(),
GamBinaryClassifier(),
PcaAnomalyDetector(),
- FactorizationMachineBinaryClassifier(),
- KMeansPlusPlus(),
- NaiveBayesClassifier(),
FastForestBinaryClassifier(number_of_trees=2),
FastForestRegressor(number_of_trees=2),
FastTreesBinaryClassifier(number_of_trees=2),
FastTreesRegressor(number_of_trees=2),
FastTreesTweedieRegressor(number_of_trees=2),
LightGbmRegressor(number_of_iterations=2),
- LightGbmClassifier(),
LightGbmBinaryClassifier(number_of_iterations=2)
]
learners_not_supported = [
- #PcaTransformer(), # REVIEW: crashes
+        FactorizationMachineBinaryClassifier(),
+        OneVsRestClassifier(FastLinearBinaryClassifier()),
+ KMeansPlusPlus(n_clusters=2),
+ NaiveBayesClassifier(),
+ LightGbmClassifier()
]
@@ -98,7 +98,6 @@ def test_model_summary(self):
pipeline.fit(train_stream, label_column)
pipeline.summary()
- @unittest.skip("No unsupported learners")
def test_model_summary_not_supported(self):
for learner in learners_not_supported:
pipeline = Pipeline(
@@ -107,6 +106,23 @@ def test_model_summary_not_supported(self):
pipeline.fit(train_stream, label_column)
assert_raises(TypeError, pipeline.summary)
+ def test_model_summary_not_supported_specific(self):
+ path = get_dataset('infert').as_filepath()
+ data = FileDataStream.read_csv(path, sep=',',
+ names={0: 'row_num', 5: 'case'})
+ pipeline = Pipeline([
+ OneHotVectorizer(columns={'edu': 'education'}),
+ FactorizationMachineBinaryClassifier(feature=['induced', 'edu', 'parity'],
+ label='case')
+ ])
+ pipeline.fit(data)
+ try:
+ pipeline.summary()
+ except TypeError as e:
+ self.assertEqual(e.args[0], "One or more predictors in this pipeline do not support the .summary() function.")
+ else:
+ assert False
+
def test_summary_called_back_to_back_on_predictor(self):
"""
When a predictor is fit without using a Pipeline,
@@ -119,24 +135,24 @@ def test_summary_called_back_to_back_on_predictor(self):
ols.summary()
def test_pipeline_summary_is_refreshed_after_refitting(self):
- predictor = OrdinaryLeastSquaresRegressor(normalize='No', l2_regularization=0)
+ predictor = OrdinaryLeastSquaresRegressor()
pipeline = Pipeline([predictor])
pipeline.fit([0,1,2,3], [1,2,3,4])
summary1 = pipeline.summary()
- pipeline.fit([0,1,2,3], [2,5,8,11])
+ pipeline.fit([0,1,2.5,3], [2,5,8,11])
summary2 = pipeline.summary()
self.assertFalse(summary1.equals(summary2))
def test_predictor_summary_is_refreshed_after_refitting(self):
- predictor = OrdinaryLeastSquaresRegressor(normalize='No', l2_regularization=0)
+ predictor = OrdinaryLeastSquaresRegressor()
predictor.fit([0,1,2,3], [1,2,3,4])
summary1 = predictor.summary()
- predictor.fit([0,1,2,3], [2,5,8,11])
+ predictor.fit([0,1,2.5,3], [2,5,8,11])
summary2 = predictor.summary()
self.assertFalse(summary1.equals(summary2))
diff --git a/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py b/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py
index 4b414c38..36d44b85 100644
--- a/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py
+++ b/src/python/nimbusml/tests/naive_bayes/test_naivebayesclassifier.py
@@ -19,8 +19,6 @@
class TestNaiveBayesClassifier(unittest.TestCase):
- @unittest.skipIf(os.name != "nt" and six.PY2,
- "encoding/decoding issues with linux py2.7, bug 286536")
def test_naivebayesclassifier(self):
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py
index 19bc26ce..3807507e 100644
--- a/src/python/nimbusml/tests/pipeline/test_load_save.py
+++ b/src/python/nimbusml/tests/pipeline/test_load_save.py
@@ -5,6 +5,7 @@
import os
import pickle
+import tempfile
import unittest
import numpy as np
@@ -32,6 +33,12 @@
(train, label) = get_X_y(train_file, label_column, sep=',')
(test, test_label) = get_X_y(test_file, label_column, sep=',')
+def get_temp_file(suffix=None):
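+    # mkstemp returns an open file descriptor; wrap and close it immediately
+    # so the tests can reopen (and later delete) the file by name.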
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
class TestLoadSave(unittest.TestCase):
@@ -48,7 +55,7 @@ def test_model_dataframe(self):
model_nimbusml.fit(train, label)
# Save with pickle
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
@@ -65,9 +72,10 @@ def test_model_dataframe(self):
test, test_label, output_scores=True)
# Save load with pipeline methods
- model_nimbusml.save_model('model.nimbusml.m')
+ model_filename = get_temp_file(suffix='.m')
+ model_nimbusml.save_model(model_filename)
model_nimbusml_load = Pipeline()
- model_nimbusml_load.load_model('model.nimbusml.m')
+ model_nimbusml_load.load_model(model_filename)
score1 = model_nimbusml.predict(test).head(5)
score2 = model_nimbusml_load.predict(test).head(5)
@@ -82,7 +90,7 @@ def test_model_dataframe(self):
model_nimbusml_load.sum().sum(),
decimal=2)
- os.remove('model.nimbusml.m')
+ os.remove(model_filename)
def test_model_datastream(self):
model_nimbusml = Pipeline(
@@ -97,7 +105,7 @@ def test_model_datastream(self):
model_nimbusml.fit(train, label)
# Save with pickle
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
@@ -120,9 +128,10 @@ def test_model_datastream(self):
decimal=2)
# Save load with pipeline methods
- model_nimbusml.save_model('model.nimbusml.m')
+ model_filename = get_temp_file(suffix='.m')
+ model_nimbusml.save_model(model_filename)
model_nimbusml_load = Pipeline()
- model_nimbusml_load.load_model('model.nimbusml.m')
+ model_nimbusml_load.load_model(model_filename)
score1 = model_nimbusml.predict(test).head(5)
score2 = model_nimbusml_load.predict(test).head(5)
@@ -137,7 +146,7 @@ def test_model_datastream(self):
model_nimbusml_load.sum().sum(),
decimal=2)
- os.remove('model.nimbusml.m')
+ os.remove(model_filename)
def test_pipeline_saves_complete_model_file_when_pickled(self):
model_nimbusml = Pipeline(
@@ -152,7 +161,7 @@ def test_pipeline_saves_complete_model_file_when_pickled(self):
model_nimbusml.fit(train, label)
metrics, score = model_nimbusml.test(test, test_label, output_scores=True)
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
# Save with pickle
with open(pickle_filename, 'wb') as f:
@@ -202,7 +211,7 @@ def test_unfitted_pickled_pipeline_can_be_fit(self):
shuffle=False,
number_of_threads=1))])
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
# Save with pickle
with open(pickle_filename, 'wb') as f:
@@ -234,7 +243,7 @@ def test_unpickled_pipeline_has_feature_contributions(self):
fc = model_nimbusml.get_feature_contributions(test)
# Save with pickle
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
@@ -260,7 +269,7 @@ def test_unpickled_predictor_has_feature_contributions(self):
fc = model_nimbusml.get_feature_contributions(test)
# Save with pickle
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
with open(pickle_filename, 'wb') as f:
pickle.dump(model_nimbusml, f)
# Unpickle model
@@ -287,7 +296,7 @@ def test_pipeline_loaded_from_zip_has_feature_contributions(self):
fc = model_nimbusml.get_feature_contributions(test)
# Save the model to zip
- model_filename = 'nimbusml_model.zip'
+ model_filename = get_temp_file(suffix='.zip')
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
@@ -312,7 +321,7 @@ def test_predictor_loaded_from_zip_has_feature_contributions(self):
fc = model_nimbusml.get_feature_contributions(test)
# Save the model to zip
- model_filename = 'nimbusml_model.zip'
+ model_filename = get_temp_file(suffix='.zip')
model_nimbusml.save_model(model_filename)
# Load the model from zip
model_nimbusml_zip = Pipeline()
@@ -347,7 +356,7 @@ def test_pickled_pipeline_with_predictor_model(self):
self.assertTrue(pipeline.predictor_model)
self.assertNotEqual(pipeline.model, pipeline.predictor_model)
- pickle_filename = 'nimbusml_model.p'
+ pickle_filename = get_temp_file(suffix='.p')
with open(pickle_filename, 'wb') as f:
pickle.dump(pipeline, f)
diff --git a/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py
index 347b2798..04f1bc35 100644
--- a/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py
+++ b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py
@@ -3,6 +3,7 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import os
+import tempfile
import unittest
from nimbusml import FileDataStream
@@ -16,6 +17,14 @@
from numpy.testing import assert_almost_equal
from pandas.testing import assert_frame_equal
+
+def get_temp_model_file():
+ fd, file_name = tempfile.mkstemp(suffix='.zip')
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+
class TestPermutationFeatureImportance(unittest.TestCase):
@classmethod
@@ -65,7 +74,7 @@ def test_binary_classifier(self):
assert_almost_equal(self.binary_pfi['AreaUnderPrecisionRecallCurve'].sum(), -0.19365, 5)
def test_binary_classifier_from_loaded_model(self):
- model_path = "model.zip"
+ model_path = get_temp_model_file()
self.binary_model.save_model(model_path)
loaded_model = Pipeline()
loaded_model.load_model(model_path)
@@ -81,7 +90,7 @@ def test_clasifier(self):
assert_almost_equal(self.classifier_pfi['PerClassLogLoss.1'].sum(), 0.419826, 6)
def test_classifier_from_loaded_model(self):
- model_path = "model.zip"
+ model_path = get_temp_model_file()
self.classifier_model.save_model(model_path)
loaded_model = Pipeline()
loaded_model.load_model(model_path)
@@ -96,7 +105,7 @@ def test_regressor(self):
assert_almost_equal(self.regressor_pfi['RSquared'].sum(), -0.203612, 6)
def test_regressor_from_loaded_model(self):
- model_path = "model.zip"
+ model_path = get_temp_model_file()
self.regressor_model.save_model(model_path)
loaded_model = Pipeline()
loaded_model.load_model(model_path)
@@ -113,7 +122,7 @@ def test_ranker(self):
assert_almost_equal(self.ranker_pfi['NDCG@3'].sum(), -0.236544, 6)
def test_ranker_from_loaded_model(self):
- model_path = "model.zip"
+ model_path = get_temp_model_file()
self.ranker_model.save_model(model_path)
loaded_model = Pipeline()
loaded_model.load_model(model_path)
diff --git a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py
index 0b9c8141..aecfc20e 100644
--- a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py
+++ b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py
@@ -6,7 +6,9 @@
import unittest
import pandas
+from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
+from nimbusml.preprocessing.schema import ColumnSelector
from sklearn.utils.testing import assert_equal
@@ -25,16 +27,15 @@ def test_holidays(self):
))
cols_to_drop = [
- 'Hour12', 'DayOfWeek', 'DayOfQuarter',
- 'DayOfYear', 'WeekOfMonth', 'QuarterOfYear',
- 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel',
- 'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff'
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]
- dts = DateTimeSplitter(prefix='dt',
- country='Canada',
- columns_to_drop=cols_to_drop) << 'tokens1'
- y = dts.fit_transform(df)
+ dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
+ pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+ y = pipeline.fit_transform(df)
self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
diff --git a/src/python/nimbusml/tests/test_entrypoints.py b/src/python/nimbusml/tests/test_entrypoints.py
index c4e53546..5ff8e0c3 100644
--- a/src/python/nimbusml/tests/test_entrypoints.py
+++ b/src/python/nimbusml/tests/test_entrypoints.py
@@ -118,8 +118,12 @@ def test_logistic_regression_graph(self):
input_data=""), dict(
output_model=""), DataOutputFormat.DF, *all_nodes)
# print(graph)
- graph.run(X=None, dryrun=True)
+ graph.run(X=None, dry_run=True)
# lr = graph.run(formula = "ylogical ~ xint1", data = ds
# , blocks_per_read = 1, report_progress = True
# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/test_variable_column.py b/src/python/nimbusml/tests/test_variable_column.py
index 6c1fc8bd..318094ff 100644
--- a/src/python/nimbusml/tests/test_variable_column.py
+++ b/src/python/nimbusml/tests/test_variable_column.py
@@ -8,17 +8,17 @@
import numpy as np
import pandas as pd
from nimbusml import Pipeline
-from nimbusml.internal.entrypoints.transforms_variablecolumn import transforms_variablecolumn
+from nimbusml.internal.entrypoints.transforms_variablecolumntransform import transforms_variablecolumntransform
from nimbusml.internal.utils.entrypoints import Graph, DataOutputFormat
class TestVariableColumn(unittest.TestCase):
def to_variable_column(self, input, features=None, length_column_name=None):
- node = transforms_variablecolumn(data='$data',
- output_data='$output_data',
- features=features,
- length_column_name=length_column_name)
+ node = transforms_variablecolumntransform(data='$data',
+ output_data='$output_data',
+ features=features,
+ length_column_name=length_column_name)
graph_nodes = [node]
graph = Graph(dict(data=''),
diff --git a/src/python/nimbusml/timeseries/ssaforecaster.py b/src/python/nimbusml/timeseries/ssaforecaster.py
index dd7e0296..35516d15 100644
--- a/src/python/nimbusml/timeseries/ssaforecaster.py
+++ b/src/python/nimbusml/timeseries/ssaforecaster.py
@@ -31,7 +31,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin):
input time-series where each component in the spectrum corresponds to a
trend, seasonal or noise component in the time-series. For details of the
Singular Spectrum Analysis (SSA), refer to `this document
- `_.
+ `_.
:param columns: see `Columns `_.
@@ -41,7 +41,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin):
:param series_length: The length of series that is kept in buffer for
modeling (parameter N).
- :param train_size: The length of series from the begining used for
+ :param train_size: The length of series from the beginning used for
training.
:param horizon: The number of values to forecast.
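An illustrative sketch tying the documented parameters together; the values and the column mapping are mine, chosen only so that window_size <= series_length <= train_size:

```python
import pandas as pd
from nimbusml.timeseries import SsaForecaster

df = pd.DataFrame(dict(y=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))
fc = SsaForecaster(window_size=3,    # L: window used to build the trajectory matrix
                   series_length=6,  # N: number of values kept in the modeling buffer
                   train_size=8,     # values from the beginning used for training
                   horizon=2,        # number of values to forecast
                   columns={'y_fc': 'y'})
print(fc.fit_transform(df))
```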
diff --git a/src/python/setup.py b/src/python/setup.py
index 5fc3fcba..e8481345 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -45,7 +45,7 @@
# Versions should comply with PEP440. For a discussion on
# single-sourcing the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
- version='1.5.1',
+ version='1.6.1',
description='NimbusML',
long_description=long_description,
@@ -115,7 +115,7 @@
'nose>=1.3', 'pytest>=4.4.0',
'graphviz', 'imageio',
],
- 'dprep': ['azureml-dataprep>=1.1.12'],
+ 'dprep': ['azureml-dataprep>=1.1.33'],
'utils': ['graphviz', 'imageio'],
},
@@ -148,7 +148,7 @@
# Although 'package_data' is the preferred approach, in some case
# you may need to place data files outside of your packages. See:
- # http://docs.python.org/3.4/distutils/setupscript.html#installing
+ # https://docs.python.org/3.4/distutils/setupscript.html#installing
# -additional-files # noqa
# In this case, 'data_file' will be installed into
# '/my_data'
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
index e65db7d8..0489bc13 100644
--- a/src/python/setup.py.in
+++ b/src/python/setup.py.in
@@ -115,7 +115,7 @@ setup(
'nose>=1.3', 'pytest>=4.4.0',
'graphviz', 'imageio',
],
- 'dprep': ['azureml-dataprep>=1.1.12'],
+ 'dprep': ['azureml-dataprep>=1.1.33'],
'utils': ['graphviz', 'imageio'],
},
@@ -148,7 +148,7 @@ setup(
# Although 'package_data' is the preferred approach, in some case
# you may need to place data files outside of your packages. See:
- # http://docs.python.org/3.4/distutils/setupscript.html#installing
+ # https://docs.python.org/3.4/distutils/setupscript.html#installing
# -additional-files # noqa
# In this case, 'data_file' will be installed into
# '