diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f87ab00 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +pyculib/_version.py export-subst diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a4d23bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +build +_build +__pycache__ +*.pyc +\#*\# +*.so +*.dll +*.dylib diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..d442eba --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include versioneer.py +include pyculib/_version.py diff --git a/README.md b/README.md index 38c2318..0752f51 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,79 @@ -# pycudalib -pycudalib +# Pyculib + +Pyculib provides Python bindings to the following CUDA libraries: + + * [cuBLAS](https://developer.nvidia.com/cublas) + * [cuFFT](https://developer.nvidia.com/cufft) + * [cuSPARSE](https://developer.nvidia.com/cusparse) + * [cuRAND](https://developer.nvidia.com/curand) + * CUDA Sorting algorithms from the [CUB](https://nvlabs.github.io/cub/) and + [Modern GPU](https://github.com/moderngpu/moderngpu) libraries. + +These bindings are direct ports of those available in [Anaconda +Accelerate](https://docs.continuum.io/accelerate/cuda-libs). + +Documentation is located [here](LINK_TO_DOCS) + +## Installing + +The easiest way to install Pyculib and get updates is by using the [Anaconda +Distribution](https://www.continuum.io/downloads) + +``` +#> conda install pyculib +``` + +To compile from source, it is recommended to create a conda environment +containing the following: + + * cffi + * cudatoolkit + * numpy + * numba + * pyculib\_sorting + * scipy + +for instructions on how to do this see the [conda](https://conda.io/docs/) +documentation, specifically the section on [managing +environments](https://conda.io/docs/using/envs.html#managing-environments). + +Once a suitable environment is activated, installation achieved simply by +running: + +``` +#> python setup.py install +``` + +and the installation can be tested with: + +``` +#> ./runtests.py +``` + + +## Documentation + +Documentation is located [here](LINK_TO_DOCS). + +### Building Documentation + +It is also possible to build a local copy of the documentation from source. +This requires GNU Make and sphinx (available via conda). + + +Documentation is stored in the `doc` folder, and should be built with: + +``` +#> make SPHINXOPTS=-Wn clean html +``` + +This ensures that the documentation renders without errors. If errors occur, +they can all be seen at once by building with: + +``` +#> make SPHINXOPTS=-n clean html +``` + +However, these errors should all be fixed so that building with `-Wn` is +possible prior to merging any documentation changes or updates. + diff --git a/condarecipe/bld.bat b/condarecipe/bld.bat new file mode 100644 index 0000000..39b5e1f --- /dev/null +++ b/condarecipe/bld.bat @@ -0,0 +1 @@ +%PYTHON% setup.py install diff --git a/condarecipe/build.sh b/condarecipe/build.sh new file mode 100755 index 0000000..4dad93a --- /dev/null +++ b/condarecipe/build.sh @@ -0,0 +1,2 @@ +#!/bin/bash +$PYTHON setup.py install diff --git a/condarecipe/meta.yaml b/condarecipe/meta.yaml new file mode 100644 index 0000000..52d61a6 --- /dev/null +++ b/condarecipe/meta.yaml @@ -0,0 +1,33 @@ +package: + name: pyculib + version: {{ GIT_DESCRIBE_TAG }} + +source: + path: .. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER|int }} + +requirements: + build: + - python + - numpy x.x + - numba + - cffi + run: + - python + - numpy x.x + - numba + - cudatoolkit 7.5 + - libgfortran [linux64] + - pyculib_sorting 1.0.0 + - cffi + +test: + requires: + - pytest + - scipy + imports: + - pyculib + commands: + - python -m unittest -v pyculib diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..525be10 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
+ +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Pyculib.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Pyculib.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Pyculib" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Pyculib" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+ +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..fe25fa4 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. 
The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Pyculib.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Pyculib.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. 
+ goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..0a45f20 --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,48 @@ +# Pyculib QuickStart + +Pyculib provides access to NVIDIA' optimized CUDA libraries +from a high-level, Pythonic interface. It builds on top of the functionality +provided in the open-source Numba JIT compiler. + + +## How do I install it? + +System requirements: + +* Python 2.7, 3.4+ +* Numpy XXX TODO: FIX +* NVIDIA CUDA-enabled GPU with compute + capability 2.0 or above. CUDA Toolkit 7.5 and driver version 349.00 or above + ([https://developer.nvidia.com/cuda-toolkit](https://developer.nvidia.com/cuda-toolkit)) +* Numba 0.33+ + +### Install from Anaconda + +Download and install Anaconda from +[https://www.continuum.io/downloads](https://www.continuum.io/downloads). + +In terminal: + +``` +conda update conda +conda install pyculib +``` + +## How do I use it? + +Pyculib provides access to optimized dense and sparse linear algebra, random +number generators, sorting routines, and FFTs. This example demonstrates the use +of CUDA-FFT routines: + + +### CUDA-Accelerated FFT + +```python +from pyculib.fft import fft +fft(x, xf) +``` + +## Where do I learn more? + +* Full documentation: (Insert URL) +* CUDA-accelerated functions: (Insert URL)/cudalibs.html diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..32e8a41 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Pyculib documentation build configuration file, created by +# sphinx-quickstart on Fri Aug 7 17:43:03 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Pyculib' +copyright = '2017, Continuum Analytics, Inc.' +author = 'Continuum Analytics, Inc.' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. 
Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Pyculibdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + (master_doc, 'Pyculib.tex', 'Pyculib Documentation', + 'Continuum Analytics, Inc.', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pyculib', 'Pyculib Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Pyculib', 'Pyculib Documentation', + author, 'Pyculib', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { 'python': ('https://docs.python.org/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'numpy': ('https://docs.scipy.org/doc/numpy', None), +} + +# Ignore python.array not being resolved +nitpick_ignore = [('py:obj', 'python.array')] diff --git a/docs/source/cublas.rst b/docs/source/cublas.rst new file mode 100644 index 0000000..8760cca --- /dev/null +++ b/docs/source/cublas.rst @@ -0,0 +1,368 @@ +cuBLAS +====== + +Provides basic linear algebra building blocks. See `NVIDIA cuBLAS +`_. + +The cuBLAS binding provides an interface that accepts NumPy arrays and Numba's +CUDA device arrays. The binding automatically transfers NumPy array arguments to +the device as required. This automatic transfer may generate some unnecessary +transfers, so optimal performance is likely to be obtained by the manual +transfer for NumPy arrays into device arrays and using the cuBLAS to manipulate +device arrays where possible. + +No special naming convention is used to identify the data +type, unlike in the BLAS C and Fortran APIs. Arguments for array storage +information which are part of the cuBLAS C API are also not necessary since +NumPy arrays and device arrays contain this information. + +All functions are accessed through the :class:`pyculib.blas.Blas` class: + +.. autoclass:: pyculib.blas.Blas + +BLAS Level 1 +------------ + +.. py:method:: pyculib.blas.Blas.nrm2(x) + + Computes the L2 norm for array `x`. Same as `numpy.linalg.norm(x)`. + + :param x: input vector + :type x: python.array + :returns: resulting norm. + +.. 
py:method:: pyculib.blas.Blas.dot(x, y)
+
+    Compute the dot product of array `x` and array `y`. Same as `np.dot(x, y)`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+    :returns: dot product of `x` and `y`
+
+.. py:method:: pyculib.blas.Blas.dotc(x, y)
+
+    Computes the dot product of array `x` and array `y` using the conjugates of
+    the elements of `x`, for complex dtypes only. Same as `np.vdot(x, y)`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+    :returns: dot product of `x` and `y`
+
+
+.. py:method:: pyculib.blas.Blas.scal(alpha, x)
+
+    Scale `x` inplace by `alpha`. Same as `x = alpha * x`
+
+    :param alpha: scalar
+    :param x: vector
+    :type x: python.array
+
+.. py:method:: pyculib.blas.Blas.axpy(alpha, x, y)
+
+    Compute `y = alpha * x + y` inplace.
+
+    :param alpha: scalar
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.amax(x)
+
+    Find the index of the first largest element in array `x`.
+    Same as `np.argmax(x)`
+
+    :param x: vector
+    :type x: python.array
+    :returns: index (starting from 0).
+
+
+.. py:method:: pyculib.blas.Blas.amin(x)
+
+    Find the index of the first smallest element in array `x`.
+    Same as `np.argmin(x)`
+
+    :param x: vector
+    :type x: python.array
+    :returns: index (starting from 0).
+
+
+.. py:method:: pyculib.blas.Blas.asum(x)
+
+    Compute the sum of all elements in array `x`.
+
+    :param x: vector
+    :type x: python.array
+    :returns: `x.sum()`
+
+.. py:method:: pyculib.blas.Blas.rot(x, y, c, s)
+
+    Apply the Givens rotation matrix specified by the cosine element `c` and the
+    sine element `s` inplace on vector elements `x` and `y`.
+
+    Same as `x, y = c * x + s * y, -s * x + c * y`
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.rotg(a, b)
+
+    Constructs the Givens rotation matrix with the column vector (a, b).
+
+    :param a: first element of the column vector
+    :param b: second element of the column vector
+    :returns: a tuple (r, z, c, s)
+
+        r -- `r = sqrt(a**2 + b**2)`
+
+        z -- Used to reconstruct `c` and `s`.
+             Refer to the cuBLAS documentation for details.
+
+        c -- The cosine element.
+
+        s -- The sine element.
+
+
+.. py:method:: pyculib.blas.Blas.rotm(x, y, param)
+
+    Applies the modified Givens transformation inplace.
+
+    Same as::
+
+        param = flag, h11, h21, h12, h22
+        x[i] = h11 * x[i] + h12 * y[i]
+        y[i] = h21 * x[i] + h22 * y[i]
+
+    Refer to the cuBLAS documentation for the use of `flag`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.rotmg(d1, d2, x1, y1)
+
+    Constructs the modified Givens transformation `H` that zeros out the second
+    entry of a column vector `(d1 * x1, d2 * y1)`.
+
+    :param d1: scaling factor for the x-coordinate of the input vector
+    :param d2: scaling factor for the y-coordinate of the input vector
+    :param x1: x-coordinate of the input vector
+    :param y1: y-coordinate of the input vector
+
+    :returns: A 1D array that is usable in `rotm`.
+              The first element is the flag for `rotm`.
+              The rest of the elements correspond to the `h11, h21, h12, h22`
+              elements of `H`.
+
+BLAS Level 2
+-------------
+
+All level 2 routines use the following naming convention for all arguments:
+
+* A, B, C, AP -- (2D array) Matrix argument.
+  `AP` implies packed storage for banded matrix.
+* x, y, z -- (1D arrays) Vector argument.
+* alpha, beta -- (scalar) Can be floats or complex numbers depending. +* m -- (scalar) Number of rows of matrix `A`. +* n -- (scalar) Number of columns of matrix `A`. If `m` is not needed, + `n` also means the number of rows of the matrix `A`; thus, + implying a square matrix. +* trans, transa, transb -- (string) + Select the operation `op` to apply to a matrix: + + - 'N': `op(X) = X`, the identity operation; + - 'T': `op(X) = X**T`, the transpose; + - 'C': `op(X) = X**H`, the conjugate transpose. + + `trans` only applies to the only matrix argument. + `transa` and `transb` apply to matrix `A` and matrix `B`, + respectively. +* uplo -- (string) Can be 'U' for filling the upper trianglar matrix; or 'L' for + filling the lower trianglar matrix. +* diag -- (boolean) Whether the matrix diagonal has unit elements. +* mode -- (string) 'L' means the matrix is on the left side in the equation. + 'R' means the matrix is on the right side in the equation. + +.. note:: The last array argument is always overwritten with the result. + +.. py:method:: pyculib.blas.Blas.gbmv(trans, m, n, kl, ku, alpha, A, x, beta, y) + + banded matrix-vector multiplication `y = alpha * op(A) * x + beta * y` where + `A` has `kl` sub-diagonals and `ku` super-diagonals. + +.. py:method:: pyculib.blas.Blas.gemv(trans, m, n, alpha, A, x, beta, y) + + matrix-vector multiplication `y = alpha * op(A) * x + beta * y` + +.. py:method:: pyculib.blas.Blas.trmv(uplo, trans, diag, n, A, x) + + triangular matrix-vector multiplication `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.tbmv(uplo, trans, diag, n, k, A, x) + + triangular banded matrix-vector `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.tpmv(uplo, trans, diag, n, AP, x) + + triangular packed matrix-vector multiplication `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.trsv(uplo, trans, diag, n, A, x) + + Solves the triangular linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.tpsv(uplo, trans, diag, n, AP, x) + + Solves the packed triangular linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.tbsv(uplo, trans, diag, n, k, A, x) + + Solves the triangular banded linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.symv(uplo, n, alpha, A, x, beta, y) + + symmetric matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hemv(uplo, n, alpha, A, x, beta, y) + + Hermitian matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.sbmv(uplo, n, k, alpha, A, x, beta, y) + + symmetric banded matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hbmv(uplo, n, k, alpha, A, x, beta, y) + + Hermitian banded matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.spmv(uplo, n, alpha, AP, x, beta, y) + + symmetric packed matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hpmv(uplo, n, alpha, AP, x, beta, y) + + Hermitian packed matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.ger(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** T + A` + +.. py:method:: pyculib.blas.Blas.geru(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** T + A` + +.. py:method:: pyculib.blas.Blas.gerc(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** H + A` + +.. 
py:method:: pyculib.blas.Blas.syr(uplo, n, alpha, x, A)
+
+    symmetric rank 1 operation `A := alpha * x * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.her(uplo, n, alpha, x, A)
+
+    Hermitian rank 1 operation `A := alpha * x * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.spr(uplo, n, alpha, x, AP)
+
+    packed symmetric rank 1 operation `A := alpha * x * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.hpr(uplo, n, alpha, x, AP)
+
+    packed Hermitian rank 1 operation `A := alpha * x * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.syr2(uplo, n, alpha, x, y, A)
+
+    symmetric rank-2 update `A = alpha * x * y ** T + y * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.her2(uplo, n, alpha, x, y, A)
+
+    Hermitian rank-2 update `A = alpha * x * y ** H + alpha * y * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.spr2(uplo, n, alpha, x, y, A)
+
+    packed symmetric rank-2 update `A = alpha * x * y ** T + y * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.hpr2(uplo, n, alpha, x, y, A)
+
+    packed Hermitian rank-2 update `A = alpha * x * y ** H + alpha * y * x ** H + A`
+
+BLAS Level 3
+-------------
+
+All level 3 routines follow the same naming convention for arguments as in the
+level 2 routines.
+
+.. py:method:: pyculib.blas.Blas.gemm(transa, transb, m, n, k, alpha, A, B, beta, C)
+
+    matrix-matrix multiplication `C = alpha * op(A) * op(B) + beta * C`
+
+.. py:method:: pyculib.blas.Blas.syrk(uplo, trans, n, k, alpha, A, beta, C)
+
+    symmetric rank-k update `C = alpha * op(A) * op(A) ** T + beta * C`
+
+.. py:method:: pyculib.blas.Blas.herk(uplo, trans, n, k, alpha, A, beta, C)
+
+    Hermitian rank-k update `C = alpha * op(A) * op(A) ** H + beta * C`
+
+.. py:method:: pyculib.blas.Blas.symm(side, uplo, m, n, alpha, A, B, beta, C)
+
+    symmetric matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * A * B + beta * C
+        else:        # side == 'R'
+            C = alpha * B * A + beta * C
+
+.. py:method:: pyculib.blas.Blas.hemm(side, uplo, m, n, alpha, A, B, beta, C)
+
+    Hermitian matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * A * B + beta * C
+        else:        # side == 'R'
+            C = alpha * B * A + beta * C
+
+.. py:method:: pyculib.blas.Blas.trsm(side, uplo, trans, diag, m, n, alpha, A, B)
+
+    Solves the triangular linear system with multiple right-hand-sides::
+
+        if side == 'L':
+            op(A) * X = alpha * B
+        else:        # side == 'R'
+            X * op(A) = alpha * B
+
+
+.. py:method:: pyculib.blas.Blas.trmm(side, uplo, trans, diag, m, n, alpha, A, B, C)
+
+    triangular matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * op(A) * B
+        else:        # side == 'R'
+            C = alpha * B * op(A)
+
+.. py:method:: pyculib.blas.Blas.dgmm(side, m, n, A, x, C)
+
+    matrix-matrix multiplication with a diagonal matrix built from the vector `x`::
+
+        if mode == 'R':
+            C = A * diag(x)
+        else:        # mode == 'L'
+            C = diag(x) * A
+
+
+.. py:method:: pyculib.blas.Blas.geam(transa, transb, m, n, alpha, A, beta, B, C)
+
+    matrix-matrix addition/transposition `C = alpha * op(A) + beta * op(B)`
diff --git a/docs/source/cuda-libs.rst b/docs/source/cuda-libs.rst
new file mode 100644
index 0000000..ef2b409
--- /dev/null
+++ b/docs/source/cuda-libs.rst
@@ -0,0 +1,14 @@
+==============
+CUDA libraries
+==============
+
+The following CUDA libraries have bindings and algorithms that are available for use with Pyculib:
+
+..
toctree::
+    :maxdepth: 1
+
+    cublas
+    cusparse
+    cufft
+    curand
+    sorting
diff --git a/docs/source/cufft.rst b/docs/source/cufft.rst
new file mode 100644
index 0000000..26cfd66
--- /dev/null
+++ b/docs/source/cufft.rst
@@ -0,0 +1,38 @@
+cuFFT
+=======
+
+Provides FFT and inverse FFT for 1D, 2D and 3D arrays.
+See `NVIDIA cuFFT <https://developer.nvidia.com/cufft>`_.
+
+
+.. note:: cuFFT only supports FFT operations on numpy.float32, numpy.float64,
+    numpy.complex64 and numpy.complex128 with a C-contiguous data layout.
+
+
+Forward FFT
+------------
+
+.. py:function:: pyculib.fft.fft(ary, out[, stream])
+.. py:function:: pyculib.fft.fft_inplace(ary[, stream])
+
+    :param ary: The input array. The inplace version stores the result here.
+    :param out: The output array for non-inplace versions.
+    :param stream: The CUDA stream in which all operations will take place.
+
+
+Inverse FFT
+------------
+
+.. py:function:: pyculib.fft.ifft(ary, out[, stream])
+.. py:function:: pyculib.fft.ifft_inplace(ary[, stream])
+
+    :param ary: The input array. The inplace version stores the result here.
+    :param out: The output array for non-inplace versions.
+    :param stream: The CUDA stream in which all operations will take place.
+
+FFTPlan
+--------
+
+.. autoclass:: pyculib.fft.FFTPlan
+    :members:
+
diff --git a/docs/source/curand.rst b/docs/source/curand.rst
new file mode 100644
index 0000000..b31224e
--- /dev/null
+++ b/docs/source/curand.rst
@@ -0,0 +1,43 @@
+cuRAND
+======
+
+Provides a `pseudo-random number generator` (PRNG) and a `quasi-random number generator` (QRNG).
+See `NVIDIA cuRAND <https://developer.nvidia.com/curand>`_.
+
+class PRNG
+-----------
+
+.. autoclass:: pyculib.rand.PRNG
+    :members:
+
+
+class QRNG
+------------
+
+.. autoclass:: pyculib.rand.QRNG
+    :members:
+
+
+Top Level PRNG Functions
+--------------------------
+
+Simple interface to the PRNG methods.
+
+.. note:: These functions automatically create a PRNG object.
+
+.. autofunction:: pyculib.rand.uniform
+
+.. autofunction:: pyculib.rand.normal
+
+.. autofunction:: pyculib.rand.lognormal
+
+.. autofunction:: pyculib.rand.poisson
+
+Top Level QRNG Functions
+--------------------------
+
+Simple interface to the QRNG methods.
+
+.. note:: These functions automatically create a QRNG object.
+
+.. autofunction:: pyculib.rand.quasi
diff --git a/docs/source/cusparse.rst b/docs/source/cusparse.rst
new file mode 100644
index 0000000..5c6c405
--- /dev/null
+++ b/docs/source/cusparse.rst
@@ -0,0 +1,763 @@
+cuSPARSE
+========
+
+Provides basic linear algebra operations for sparse matrices. See `NVIDIA
+cuSPARSE <https://developer.nvidia.com/cusparse>`_ for an in-depth description
+of the cuSPARSE library and its methods and data types. All functions are
+accessed through the :class:`pyculib.sparse.Sparse` class:
+
+.. autoclass:: pyculib.sparse.Sparse
+
+Similarly to the cuBLAS interface, no special naming convention is used for
+functions to operate on different datatypes - all datatypes are handled by each
+function, and dispatch of the corresponding library function is handled by
+Pyculib. However, it is often necessary to provide a *matrix descriptor* to
+functions, which provides some information about the format and properties of a
+matrix. A matrix descriptor can be obtained from the
+:py:meth:`pyculib.sparse.Sparse.matdescr` method:
+
+.. py:method:: pyculib.sparse.Sparse.matdescr(indexbase, diagtype, fillmode, matrixtype)
+
+    Creates a matrix descriptor that describes a matrix with the given
+    `indexbase`, `diagtype`, `fillmode`, and `matrixtype`. Note that not all of
+    these options are relevant to every matrix storage format.
+ + :param indexbase: Optional. 0 for 0-based indexing, or 1 for 1-based + indexing. If not specified, the default given to the + :py:class:`pyculib.sparse.Sparse` constructor is + used instead. + :param diagtype: Optional. Defaults to `'N'`. `'N'` signifies that the matrix + diagonal has non-unit elements. `'U'` signifies that the + matrix diagonal only contains unit elements. + :param fillmode: Optional. Defaults to `'L'`. `'L'` indicates that the lower + triangular part of the matrix is stored. `'U'` indicates + that the upper triangular part of the matrix is stored. + :param matrixtype: Optional. Defaults to `'G'`. `'S'` indicates that the + matrix is symmetric. `'H'` indicates that the matrix is + Hermitian. `'T'` indicates that the matrix is triangular. + `'G'` is used for a *general* matrix, which is not + symmetric, Hermitian, or triangular. + :return: A matrix descriptor. + +Many of the methods of the :class:`pyculib.sparse.Sparse` class accept +the individual data structures that make up a sparse representation of a matrix +(for example the values, the row pointers and the column indices for a CSR +format matrix). However, some methods (such as +:py:meth:`pyculib.sparse.Sparse.csrgemm_ez`), accept an instance of the +:class:`pyculib.sparse.CudaSparseMatrix` class: + +.. py:class:: pyculib.sparse.CudaSparseMatrix() + + Base class for a representation of a sparse matrix on a CUDA device. The + constructor takes no arguments. + + .. py:method:: from_host_matrix(matrix, stream) + + Initialise the matrix structure and values from an instance of a matrix on + the host. The host matrix must be of the corresponding host type, which is + documented for each subclass below. + + .. py:method:: copy_to_host(stream) + + Create an instance of the corresponding host matrix type and copy the + matrix structure and data into it from the device. See subclass + documentation for an indication of the corresponding matrix type. + +Subclasses of the sparse matrix type are: + +.. py:class:: pyculib.sparse.CudaBSRMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.bsr_matrix`. + +.. py:class:: pyculib.sparse.CudaCSRMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.csr_matrix`. + +.. py:class:: pyculib.sparse.CudaCSCMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.csc_matrix`. + +There are also some convenience methods for constructing CUDA sparse matrices in +a similar manner to Scipy sparse matrices: + +.. automethod:: pyculib.sparse.bsr_matrix + +.. automethod:: pyculib.sparse.csr_matrix + +.. automethod:: pyculib.sparse.csc_matrix + +BLAS Level 1 +------------ + +.. py:method:: pyculib.sparse.Sparse.axpyi(alpha, xVal, xInd, y) + + Multiplies the sparse vector `x` by `alpha` and adds the result to the dense + vector `y`. + + :param alpha: scalar + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: dense vector + +.. py:method:: pyculib.sparse.Sparse.doti(xVal, xInd, y) + + Computes the dot product of the sparse vector `x` and dense vector `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: scalar + +.. py:method:: pyculib.sparse.Sparse.dotci(xVal, xInd, y) + + Computes the dot product of the complex conjugate of the sparse vector `x` + and the dense vector `y`. 
+ + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: scalar + +.. py:method:: pyculib.sparse.Sparse.gthr(y, xVal, xInd) + + Gathers the elements of `y` at the indices `xInd` into the array `xVal` + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.gthrz(y, xVal, xInd) + + Gathers the elements of `y` at the indices `xInd` into the array `xVal` and + zeroes out the gathered elements of `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.roti(xVal, xInd, y, c, s) + + Applies the Givens rotation matrix, `G`: + + .. math:: + + G = \left( \begin{array}{cc} + C & S \\ + -S & C + \end{array}\right) + + to the sparse vector `x` and dense vector + `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :param c: cosine element of the rotation matrix + :param s: sine element of the rotation matrix + :return: None + +.. py:method:: pyculib.sparse.Sparse.sctr(xVal, xInd, y) + + Scatters the elements of the sparse vector `x` into the dense vector `y`. + Elements of `y` whose indices are not listed in `xInd` are unmodified. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + + +BLAS Level 2 +------------ + +All level 2 routines follow the following naming convention for the following +arguments: + +* alpha, beta -- (scalar) Can be real or complex numbers. +* descr, descrA, descrB -- (descriptor) Matrix descriptor. An appropriate + descriptor may be obtained by calling + :py:meth:`pyculib.sparse.Sparse.matdescr`. `descr` only applies to the + only matrix argument. `descrA` and `descrB` apply to matrix `A` and matrix + `B`, respectively. +* dir -- (string) Can be `'C'` to indicate column-major block storage or `'R'` + to indicate row-major block storage. +* trans, transa, transb -- (string) + Select the operation `op` to apply to a matrix: + + - `'N'`: `op(X) = X`, the identity operation; + - `'T'`: `op(X) = X**T`, the transpose; + - `'C'`: `op(X) = X**H`, the conjugate transpose. + + `trans` only applies to the only matrix argument. + `transa` and `transb` apply to matrix `A` and matrix `B`, + respectively. + + +.. py:method:: pyculib.sparse.Sparse.bsrmv_matrix(dir, trans, alpha, descr, bsrmat, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + BSR-format matrix. + + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrmat: the matrix `A` + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.bsrmv(dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + BSR-format matrix. This function accepts the individual arrays that make up + the structure of a BSR matrix - if a + :class:`pyculib.sparse.CudaBSRMatrix` instance is to hand, it is + recommended to use the :py:meth:`bsrmv_matrix` method instead. 
+ + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param mb: Number of block rows of the matrix + :param nb: Number of block columns of the matrix + :param nnzb: Number of nonzero blocks of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrVal: vector of nonzero values of the matrix + :param bsrRowPtr: vector of block row pointers of the matrix + :param bsrColInd: vector of block column indices of the matrix + :param blockDim: block dimension of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.bsrxmv(dir, trans, sizeOfMask, mb, nb, nnzb, alpha, descr, bsrVal, bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, blockDim, x, beta, y) + + Matrix-vector multiplication similar to :py:meth:`bsrmv`, but including a + mask operation: `y(mask) = (alpha * op(A) * x + beta * y)(mask)`. The blocks + of y to be updated are specified in `bsrMaskPtr`. Blocks whose indices are + not specified in `bsrMaskPtr` are left unmodified. + + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param sizeOfMask: number of updated blocks of rows of `y` + :param mb: Number of block rows of the matrix + :param nb: Number of block columns of the matrix + :param nnzb: Number of nonzero blocks of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrVal: vector of nonzero values of the matrix + :param bsrMaskPtr: vector of indices of the block elements to be updated + :param bsrRowPtr: vector of block row pointers of the matrix + :param bsrEndPtr: vector of pointers to the end of every block row plus one + :param bsrColInd: vector of block column indices of the matrix + :param blockDim: block dimension of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrmv(trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + CSR-format matrix. + + :param trans: operation to apply to the matrix + :param m: Number of rows of the matrix + :param n: Number of columns of the matrix + :param nnz: Number of nonzeroes of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param csrVal: vector of nonzero values of the matrix + :param csrRowPtr: vector of row pointers of the matrix + :param csrColInd: vector of column indices of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrsv_analysis(trans, m, nnz, descr, csrVal, csrRowPtr, csrColInd) + + Performs the analysis phase of the solution of the sparse triangular linear + system `op(A) * y = alpha * x`. This needs to be executed only once for a + given matrix and operation type. + + :param trans: operation to apply to the matrix + :param m: number of rows of the matrix + :param nnz: number of nonzeroes of the matrix + :param descr: matrix descriptor + :param csrVal: vector of nonzero values of the matrix + :param csrRowPtr: vector of row pointers of the matrix + :param csrColInd: vector of column indices of the matrix + :return: the analysis result, which can be used as input to the solve phase + +.. 
py:method:: pyculib.sparse.Sparse.csrsv_solve(trans, m, alpha, descr, csrVal, csrRowPtr, csrColInd, info, x, y)
+
+    Performs the solve phase of the solution of the sparse triangular linear
+    system `op(A) * y = alpha * x`, using the analysis result `info` from
+    :py:meth:`csrsv_analysis`.
+
+    :param trans: operation to apply to the matrix
+    :param m: number of rows of the matrix
+    :param alpha: scalar
+    :param descr: matrix descriptor
+    :param csrVal: vector of nonzero values of the matrix
+    :param csrRowPtr: vector of row pointers of the matrix
+    :param csrColInd: vector of column indices of the matrix
+    :param info: the analysis result from :py:meth:`csrsv_analysis`
+    :param x: dense vector
+    :param y: dense vector into which the solve result is stored
+    :return: None
+
+
+BLAS Level 3
+------------
+
+.. py:method:: pyculib.sparse.Sparse.csrmm(transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc)
+
+    Matrix-matrix multiplication `C = alpha * op(A) * B + beta * C` where `A` is
+    a sparse matrix in CSR format and `B` and `C` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param m: number of rows of `A`
+    :param n: number of columns of `B` and `C`
+    :param k: number of columns of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param alpha: scalar
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :param B: dense matrix
+    :param ldb: leading dimension of `B`
+    :param beta: scalar
+    :param C: dense matrix
+    :param ldc: leading dimension of `C`
+    :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csrmm2(transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc)
+
+    Matrix-matrix multiplication `C = alpha * op(A) * op(B) + beta * C` where `A` is
+    a sparse matrix in CSR format and `B` and `C` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param transB: operation to apply to `B`
+    :param m: number of rows of `A`
+    :param n: number of columns of `B` and `C`
+    :param k: number of columns of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param alpha: scalar
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :param B: dense matrix
+    :param ldb: leading dimension of `B`
+    :param beta: scalar
+    :param C: dense matrix
+    :param ldc: leading dimension of `C`
+    :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csrsm_analysis(transA, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA)
+
+    Performs the analysis phase of the solution of a sparse triangular linear
+    system `op(A) * Y = alpha * X` with multiple right-hand sides, where `A` is a
+    sparse matrix in CSR format, and `X` and `Y` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param m: number of rows of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :return: the analysis result
+
+..
+.. py:method:: pyculib.sparse.Sparse.csrsm_solve(transA, m, n, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, info, X, ldx, Y, ldy)
+
+   Performs the solve phase of the solution of a sparse triangular linear
+   system `op(A) * Y = alpha * X` with multiple right-hand sides, where `A` is
+   a sparse matrix in CSR format and `X` and `Y` are dense matrices.
+
+   :param transA: operation to apply to `A`
+   :param m: number of rows of `A`
+   :param n: number of columns of `X` and `Y` (the number of right-hand sides)
+   :param alpha: scalar
+   :param descrA: matrix descriptor
+   :param csrValA: vector of nonzero values of `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param info: the analysis result from :py:meth:`csrsm_analysis`
+   :param X: dense matrix
+   :param ldx: leading dimension of `X`
+   :param Y: dense matrix into which the solve result is stored
+   :param ldy: leading dimension of `Y`
+   :return: None
+
+
+Extra Functions
+---------------
+
+.. py:method:: pyculib.sparse.Sparse.XcsrgeamNnz(m, n, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC)
+
+   Set up the sparsity pattern for the matrix operation `C = alpha * A + beta *
+   B` where `A`, `B`, and `C` are all sparse matrices in CSR format.
+
+   :param m: number of rows of all matrices
+   :param n: number of columns of all matrices
+   :param descrA: matrix descriptor for `A`
+   :param nnzA: number of nonzeroes in `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param descrB: matrix descriptor for `B`
+   :param nnzB: number of nonzeroes in `B`
+   :param csrRowPtrB: vector of row pointers of `B`
+   :param csrColIndB: vector of column indices of `B`
+   :param descrC: matrix descriptor for `C`
+   :param csrRowPtrC: vector of row pointers of `C`, written to by this method
+   :return: number of nonzeroes in `C`
+
+.. py:method:: pyculib.sparse.Sparse.csrgeam(m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC)
+
+   Performs the matrix operation `C = alpha * A + beta * B` where `A`, `B`,
+   and `C` are all sparse matrices in CSR format.
+
+   :param m: number of rows of all matrices
+   :param n: number of columns of all matrices
+   :param alpha: scalar
+   :param descrA: matrix descriptor for `A`
+   :param nnzA: number of nonzeroes in `A`
+   :param csrValA: vector of nonzero values of `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param beta: scalar
+   :param descrB: matrix descriptor for `B`
+   :param nnzB: number of nonzeroes in `B`
+   :param csrValB: vector of nonzero values of `B`
+   :param csrRowPtrB: vector of row pointers of `B`
+   :param csrColIndB: vector of column indices of `B`
+   :param descrC: matrix descriptor for `C`
+   :param csrValC: vector of nonzero values of `C`
+   :param csrRowPtrC: vector of row pointers of `C`
+   :param csrColIndC: vector of column indices of `C`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.XcsrgemmNnz(transA, transB, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC)
+
+   Set up the sparsity pattern for the matrix operation `C = op(A) * op(B)`
+   where `A`, `B`, and `C` are all sparse matrices in CSR format.
+ + :param transA: operation to apply to `A` + :param transB: operation to apply to `B` + :param m: number of rows of `A` and `C` + :param n: number of columns of `B` and `C` + :param k: number of columns/rows of `A`/`B` + :param descrA: matrix descriptor for `A` + :param nnzA: number of nonzeroes in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param descrB: matrix descriptor for `B` + :param nnzB: number of nonzeroes in `B` + :param csrRowPtrB: vector of row pointers of `B` + :param csrColIndB: vector of column indices of `B` + :param descrC: matrix descriptor for `C` + :param csrRowPtrC: vector of row pointers of `C`, written by this function + :return: number of nonzeroes in `C` + +.. py:method:: pyculib.sparse.Sparse.csrgemm(transA, transB, m, n, k, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC) + + Perform the matrix operation `C = op(A) * op(B)` where `A`, `B`, and `C` are + all sparse matrices in CSR format. + + :param transA: operation to apply to `A` + :param transB: operation to apply to `B` + :param m: number of rows of `A` and `C` + :param n: number of columns of `B` and `C` + :param k: number of columns/rows of `A`/`B` + :param descrA: matrix descriptor for `A` + :param nnzA: number of nonzeroes in `A` + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param descrB: matrix descriptor for `B` + :param nnzB: number of nonzeroes in `B` + :param csrValB: vector of nonzero values in `B` + :param csrRowPtrB: vector of row pointers of `B` + :param csrColIndB: vector of column indices of `B` + :param descrC: matrix descriptor for `C` + :param csrValC: vector of nonzero values in `C` + :param csrRowPtrC: vector of row pointers of `C` + :param csrColIndC: vector of column indices of `C` + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrgemm_ez(A, B, transA='N', transB='N', descrA=None, descrB=None, descrC=None) + + Performs the matrix operation `C = op(A) * op(B)` where `A`, `B` and `C` + are all sparse matrices in CSR format. This function accepts and returns + :py:class:`pyculib.sparse.CudaCSRMatrix` matrices, and makes + calls to :py:meth:`XcsrgemmNnz` and :py:meth:`csrgemm`. + + :param A: :py:class:`pyculib.sparse.CudaCSRMatrix` + :param B: :py:class:`pyculib.sparse.CudaCSRMatrix` + :param transA: optional, operation to apply to `A` + :param transB: optional, operation to apply to `B` + :param descrA: optional, matrix descriptor for `A` + :param descrB: optional, matrix descriptor for `B` + :param descrC: optional, matrix descriptor for `C` + :return: :py:class:`pyculib.sparse.CudaCSRMatrix` + + +Preconditioners +--------------- + +.. py:method:: pyculib.sparse.Sparse.csric0(trans, m, descr, csrValA, csrRowPtrA, csrColIndA, info) + + Computes incomplete Cholesky factorization of a sparse matrix in CSR format + with 0 fill-in and no pivoting: `op(A) = R**T * R`. This method must follow a + call to :py:meth:`csrsv_analysis`. The matrix `A` is overwritten with the + upper or lower triangular factors `R` or `R**T`. 
+ + :param trans: operation to apply to the matrix + :param m: number of rows and columns of the matrix + :param descr: matrix descriptor + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param info: analysis result + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrilu0(trans, m, descr, csrValA, csrRowPtrA, csrColIndA, info) + + Computes incomplete-LU factorization of a sparse matrix in CSR format with 0 + fill-in and no pivoting: `op(A) = L * U`. This method must follow a call to + :py:meth:`csrsv_analysis`. The matrix `A` is overwritten with the lower and + upper triangular factors `L` and `U`. + + :param trans: operation to apply to the matrix + :param m: number of rows and columns of the matrix + :param descr: matrix descriptor + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param info: analysis result + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsv(m, n, dl, d, du, B, ldb) + + Computes the solution of a tridiagonal linear system with multiple right-hand + sides: `A * Y = alpha * X`. + + :param m: the size of the linear system + :param n: the number of right-hand sides in the system + :param dl: dense vector storing the lower-diagonal elements + :param d: dense vector storing the diagonal elements + :param du: dense vector storing the upper-diagonal elements + :param B: dense matrix holding the right-hand sides of the system + :param ldb: the leading dimension of `B` + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsv_nopivot(m, n, dl, d, du, B, ldb) + + Similar to :py:meth:`gtsv`, but computes the solution without performing any + pivoting. + + :param m: the size of the linear system + :param n: the number of right-hand sides in the system + :param dl: dense vector storing the lower-diagonal elements + :param d: dense vector storing the diagonal elements + :param du: dense vector storing the upper-diagonal elements + :param B: dense matrix holding the right-hand sides of the system + :param ldb: the leading dimension of `B` + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsvStridedBatch(m, dl, d, du, x, batchCount, batchStride) + + Computes the solution of `i` tridiagonal linear systems: `A(i) * y(i) = alpha + * x(i)`. + + :param m: the size of the linear systems + :param dl: stacked dense vector storing the lower-diagonal elements of each + system + :param d: stacked dense vector storing the diagonal elements of each system + :param du: stacked dense vector storing the upper-diagonal elements of each + system + :param x: dense matrix holding the right-hand sides of the systems + :param batchCount: number of systems to solve + :param batchStride: number of elements separating the vectors of each system + :return: None + + +Format Conversion +----------------- + +.. py:method:: pyculib.sparse.Sparse.bsr2csr(dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, bsrColIndA, blockDim, descrC, csrValC, csrRowPtrC, csrColIndC) + + Convert the sparse matrix `A` in BSR format to CSR format, stored in `C`. 
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param mb: number of block rows of `A`
+   :param nb: number of block columns of `A`
+   :param descrA: matrix descriptor for `A`
+   :param bsrValA: vector of nonzero values of `A`
+   :param bsrRowPtrA: vector of block row pointers of `A`
+   :param bsrColIndA: vector of block column indices of `A`
+   :param blockDim: block dimension of `A`
+   :param descrC: matrix descriptor for `C`
+   :param csrValC: vector of nonzero values in `C`
+   :param csrRowPtrC: vector of row pointers of `C`
+   :param csrColIndC: vector of column indices of `C`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.Xcoo2csr(cooRowInd, nnz, m, csrRowPtr)
+
+   Converts an array containing uncompressed row indices corresponding to the
+   COO format into an array of compressed row pointers corresponding to the
+   CSR format.
+
+   :param cooRowInd: integer array of uncompressed row indices
+   :param nnz: number of nonzeroes
+   :param m: number of matrix rows
+   :param csrRowPtr: vector of row pointers to be written to
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csc2dense(m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda)
+
+   Convert the sparse matrix `A` in CSC format into a dense matrix.
+
+   :param m: number of rows of `A`
+   :param n: number of columns of `A`
+   :param descrA: matrix descriptor for `A`
+   :param cscValA: values in the CSC representation of `A`
+   :param cscRowIndA: row indices in the CSC representation of `A`
+   :param cscColPtrA: column pointers in the CSC representation of `A`
+   :param A: dense matrix representation of `A`, written by this function
+   :param lda: leading dimension of `A`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.Xcsr2bsrNnz(dirA, m, n, descrA, csrRowPtrA, csrColIndA, blockDim, descrC, bsrRowPtrC)
+
+   Performs the analysis necessary for converting a matrix in CSR format into
+   BSR format.
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor for input matrix `A`
+   :param csrRowPtrA: row pointers of matrix
+   :param csrColIndA: column indices of matrix
+   :param blockDim: block dimension of output matrix `C`
+   :param descrC: matrix descriptor for output matrix `C`
+   :param bsrRowPtrC: block row pointers of output matrix `C`, written by this
+       function
+   :return: number of nonzero blocks of the output matrix `C`
+
+.. py:method:: pyculib.sparse.Sparse.csr2bsr(dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC)
+
+   Performs conversion of a matrix from CSR format into BSR format.
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor for input matrix `A`
+   :param csrValA: nonzero values of matrix
+   :param csrRowPtrA: row pointers of matrix
+   :param csrColIndA: column indices of matrix
+   :param blockDim: block dimension of output matrix `C`
+   :param descrC: matrix descriptor for output matrix `C`
+   :param bsrValC: nonzero values of output matrix `C`
+   :param bsrRowPtrC: block row pointers of output matrix `C`
+   :param bsrColIndC: block column indices of output matrix `C`
+   :return: number of nonzeroes of matrix
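+
+Conversions that produce a new sparse layout follow a two-step pattern: the
+``Nnz`` routine sizes the output, and the conversion call then fills it. The
+following is a minimal sketch of a CSR-to-BSR conversion, assuming a
+``Sparse()`` instance, host NumPy arrays (assumed to be copied to the device
+automatically), and a ``matdescr`` descriptor helper:
+
+.. code-block:: python
+
+   import numpy as np
+   from pyculib import sparse
+
+   sp = sparse.Sparse()
+
+   # A 4x4 matrix with 5 nonzeroes, in CSR form (0-based indices).
+   m = n = 4
+   csrValA    = np.array([1., 2., 3., 4., 5.], dtype=np.float32)
+   csrRowPtrA = np.array([0, 1, 2, 4, 5], dtype=np.int32)
+   csrColIndA = np.array([0, 1, 0, 2, 3], dtype=np.int32)
+
+   descrA = sp.matdescr()   # assumed descriptor helper
+   descrC = sp.matdescr()
+   blockDim = 2
+   mb = (m + blockDim - 1) // blockDim   # number of block rows
+
+   # Step 1: fill the block row pointers of C and count nonzero blocks.
+   bsrRowPtrC = np.zeros(mb + 1, dtype=np.int32)
+   nnzb = sp.Xcsr2bsrNnz('R', m, n, descrA, csrRowPtrA, csrColIndA,
+                         blockDim, descrC, bsrRowPtrC)
+
+   # Step 2: allocate the BSR value/index arrays and convert.
+   bsrValC    = np.zeros(nnzb * blockDim * blockDim, dtype=csrValA.dtype)
+   bsrColIndC = np.zeros(nnzb, dtype=np.int32)
+   sp.csr2bsr('R', m, n, descrA, csrValA, csrRowPtrA, csrColIndA,
+              blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC)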
+.. py:method:: pyculib.sparse.Sparse.Xcsr2coo(csrRowPtr, nnz, m, cooRowInd)
+
+   Converts an array of compressed row pointers corresponding to the CSR
+   format into an array of uncompressed row indices corresponding to the COO
+   format.
+
+   :param csrRowPtr: vector of row pointers
+   :param nnz: number of nonzeroes
+   :param m: number of rows of matrix
+   :param cooRowInd: vector of uncompressed row indices written by this function
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csr2csc(m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues)
+
+   Converts a sparse matrix in CSR format into a sparse matrix in CSC format.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param nnz: number of nonzeroes of the matrix
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :param cscVal: values in the CSC representation
+   :param cscRowInd: row indices in the CSC representation
+   :param cscColPtr: column pointers in the CSC representation
+   :param copyValues: `'N'` for a numeric copy of the values, or `'S'` for a
+       symbolic (pattern-only) copy
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csr2dense(m, n, descr, csrVal, csrRowPtr, csrColInd, A, lda)
+
+   Convert a sparse matrix in CSR format into dense format.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descr: matrix descriptor
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :param A: the dense representation, written to by this function
+   :param lda: leading dimension of the matrix
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.dense2csc(m, n, descrA, A, lda, nnzPerCol, cscVal, cscRowInd, cscColPtr)
+
+   Convert a dense matrix into a sparse matrix in CSC format. The `nnzPerCol`
+   parameter may be computed with a call to :py:meth:`nnz`.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor
+   :param A: the matrix in dense format
+   :param lda: leading dimension of the matrix
+   :param nnzPerCol: array containing the number of nonzero elements per column
+   :param cscVal: values in the CSC representation
+   :param cscRowInd: row indices in the CSC representation
+   :param cscColPtr: column pointers in the CSC representation
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.dense2csr(m, n, descrA, A, lda, nnzPerRow, csrVal, csrRowPtr, csrColInd)
+
+   Convert a dense matrix into a sparse matrix in CSR format. The `nnzPerRow`
+   parameter may be computed with a call to :py:meth:`nnz`.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor
+   :param A: the matrix in dense format
+   :param lda: leading dimension of the matrix
+   :param nnzPerRow: array containing the number of nonzero elements per row
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.nnz(dirA, m, n, descrA, A, lda, nnzPerRowCol)
+
+   Computes the number of nonzero elements per row or column of a dense
+   matrix, and the total number of nonzero elements in the matrix.
+
+   :param dirA: `'R'` for the number of nonzeroes per row, or `'C'` for per
+       column.
+ :param m: number of rows of matrix + :param n: number of columns of matrix + :param descrA: matrix descriptor + :param A: the matrix + :param lda: leading dimension of the matrix + :param nnzPerRowCol: array to contain the number of nonzeroes per row or + column + :return: total number of nonzeroes in the matrix diff --git a/docs/source/env-variables.rst b/docs/source/env-variables.rst new file mode 100644 index 0000000..43fdb18 --- /dev/null +++ b/docs/source/env-variables.rst @@ -0,0 +1,7 @@ +===================== +Environment variables +===================== + +``PYCULIB_WARNINGS`` + +If set to anything but 0 (zero), Pyculib may issue performance warnings, such as when input arguments need to be copied to adjust their data layout, or types, to match particular backend requirements. diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..5678440 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,51 @@ +========= +Pyculib +========= + +:emphasis:`High Performance Computing` + +Pyculib is a package that provides access to several numerical libraries that are optimized for performance on NVidia GPUs. + +Pyculib was originally part of Accelerate, developed by Continuum Analytics Inc. + +The current version, 0.1.0, was released on TODO. + +Features +======== + +* Bindings to the following :doc:`cuda-libs`: + * :doc:`cublas` + * :doc:`cufft` + * :doc:`cusparse` + * :doc:`curand` + * :doc:`sorting` algorithms from the CUB and Modern GPU libraries + +Installation +============ + +This section contains information related to: + +.. toctree:: + :maxdepth: 1 + + install + + +User guide +========== + +This section contains information related to: + +.. toctree:: + :maxdepth: 1 + + cuda-libs + env-variables + +Release notes +============= + +.. toctree:: + :maxdepth: 1 + + release-notes diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 0000000..b5c5fa2 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,36 @@ +Requirements +============ + +* 64-bit operating system--Windows, macOS or Linux +* Supported Python and Numpy combinations: + * Python XXX with Numpy XXX TODO: FILL IN +* Numba 0.33 + +CUDA feature requirements +------------------------- + +* NVidia driver version XXX or later TODO: FILL IN +* CUDA toolkit 7.5 +* At least one CUDA GPU with compute capability 2.0 or above + + +Installing Pyculib +===================== + +If you already have the `Anaconda free +Python distribution `, take the following steps to install Pyculib: + +#. Run the command ``conda update conda``. +#. Run the command ``conda install pyculib``. + +If you do not have Anaconda installed, see `Downloads `_. + +NOTE: Pyculib can also be installed into your own non-Anaconda Python environment via pip or setuptools. + +Updating Pyculib +=================== + +To update Pyculib, take the following steps: + +#. Run the command ``conda update conda``. +#. Run the command ``conda update pyculib``. diff --git a/docs/source/release-notes.rst b/docs/source/release-notes.rst new file mode 100644 index 0000000..93b55c5 --- /dev/null +++ b/docs/source/release-notes.rst @@ -0,0 +1,27 @@ +============= +Release notes +============= + +Version 1.0.0 +============= + +NumbaPro and Accelerate have been deprecated, and code generation features have +been moved into open-source Numba. The CUDA library functions have been moved +into Pyculib. There will be no further updates to NumbaPro or Accelerate. 
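+
+For example, code that imported the Accelerate CUDA BLAS bindings now imports
+the Pyculib equivalent (the full package mapping is given in the table below):
+
+.. code-block:: python
+
+   # Accelerate 2.x
+   from accelerate.cuda import blas
+
+   # Pyculib
+   from pyculib import blas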
+ +CUDA libraries +-------------- + +Pyculib CUDA library functionality is equivalent to that in Accelerate 2.+, +with the following packages renamed: + +=========================== =========================== +Accelerate package Pyculib package +=========================== =========================== +``accelerate.cuda.blas`` ``pyculib.blas`` +``accelerate.cuda.fft`` ``pyculib.fft`` +``accelerate.cuda.rand`` ``pyculib.rand`` +``accelerate.cuda.sparse`` ``pyculib.sparse`` +``accelerate.cuda.sorting`` ``pyculib.sorting`` +=========================== =========================== + diff --git a/docs/source/sorting.rst b/docs/source/sorting.rst new file mode 100644 index 0000000..60fb84e --- /dev/null +++ b/docs/source/sorting.rst @@ -0,0 +1,25 @@ +============ +CUDA Sorting +============ + +Pyculib provides routines for sorting arrays on CUDA GPUs. + +Sorting Large Arrays +==================== + +The :py:class:`pyculib.sorting.RadixSort` class is recommended for +sorting large (approx. more than 1 million items) arrays of numeric types. + +.. autoclass:: pyculib.sorting.RadixSort + :members: + +Sorting Many Small Arrays +========================= + +Using :py:class:`pyculib.sorting.RadixSort` on small (approx. less than +1 million items) arrays has significant overhead due to multiple kernel +launches. + +A better alternative is to use :py:func:`pyculib.sorting.segmented_sort`-which launches a single kernel for sorting a batch of many small arrays. + +.. autofunction:: pyculib.sorting.segmented_sort diff --git a/pyculib/__init__.py b/pyculib/__init__.py new file mode 100644 index 0000000..babdaba --- /dev/null +++ b/pyculib/__init__.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import +import numba +import re +import unittest +import sys +import platform +from pyculib import config +from pyculib import warnings + +NUMBA_VERSION_REQ = (0, 33, 0) + +def check_numba_version(): + m = re.match(r"(\d+)\.(\d+)\.(\d+).*", numba.__version__) + if m is None or tuple(map(int, m.groups())) < NUMBA_VERSION_REQ: + import warnings + warnings.showwarning( + "Numba version too old; expecting %d.%d.%d" % NUMBA_VERSION_REQ, + ImportWarning, __name__, 1) + +check_numba_version() + +def load_tests(loader, tests, pattern): + from .tests import test_cases + + suite = unittest.TestSuite() + for test_class in test_cases: + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + return suite + +def cuda_compatible(): + if sys.platform.startswith('darwin'): + ver = platform.mac_ver()[0] + # version string can contain two or three components + major, minor = ver.split('.', 1) + if '.' in minor: + minor, micro = minor.split('.', 1) + if (int(major), int(minor)) < (10, 9): + return False + + is_64bits = sys.maxsize > 2**32 + if not is_64bits: + return False + + return True + +if cuda_compatible(): + from numba import cuda + from . 
import blas, sparse, fft, rand, sorting + +def test(): + success = True + if cuda_compatible() and cuda.is_available(): + print('CUDA Library tests'.center(80, '~'), '\n') + print('cuBLAS'.center(80, '-')) + if not blas.test().wasSuccessful(): + success = False + print('cuSPARSE'.center(80, '-')) + if not sparse.test().wasSuccessful(): + success = False + print('cuFFT'.center(80, '-')) + if not fft.test().wasSuccessful(): + success = False + print('cuRAND'.center(80, '-')) + if not rand.test().wasSuccessful(): + success = False + print('Sorting'.center(80, '-')) + if not sorting.test().wasSuccessful(): + success = False + else: + print('CUDA unavailable - skipped CUDA tests') + + return success + +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/pyculib/_version.py b/pyculib/_version.py new file mode 100644 index 0000000..8a81283 --- /dev/null +++ b/pyculib/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "pyculib" + cfg.versionfile_source = "pyculib/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/pyculib/blas/__init__.py b/pyculib/blas/__init__.py new file mode 100644 index 0000000..6d9a747 --- /dev/null +++ b/pyculib/blas/__init__.py @@ -0,0 +1,149 @@ +from __future__ import absolute_import, print_function +from .api import Blas, validate_argument_dtype +from pyculib.nputil import promote, alias, astype, colmajor +import numpy as np +from numbers import Number + + +_blas = None + +def _normalize_op(op): + + if op == 'n': return 'N' + elif op == 't': return 'T' + elif op == 'c': return 'C' + return op + + +def dot(x, y, stream=None): + """Compute and return the vector dot product of x and y.""" + global _blas + + validate_argument_dtype(x, 'x') + validate_argument_dtype(y, 'y') + if not _blas: _blas = Blas() + _blas.stream = stream + dtype = promote(x.dtype, y.dtype) + # FIXME: the col-major constraint actually only applies to host arrays. 
+ # If x and y are already device arrays they can be strided. + return _blas.dot(colmajor(x, dtype, 'x'), colmajor(y, dtype, 'y')) + +def axpy(alpha, x, y, stream=None): + """y <- alpha*x + y """ + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(x, 'x') + validate_argument_dtype(y, 'y') + if not _blas: _blas = Blas() + _blas.stream = stream + dtype = promote(promote(type(alpha), x.dtype), y.dtype) + yf = colmajor(y, dtype, 'y') + _blas.axpy(dtype.type(alpha), x.astype(dtype), yf) + if y.dtype == yf.dtype and not alias(y, yf): + y[:] = yf + return y + else: + return yf + +def gemv(trans, alpha, A, x, beta=0, y=None, stream=None): + """Generalized matrix-vector multiplication: + + y <- alpha*trans(A)*x + beta*y + + 'beta' and 'y' are optional on input. Return 'y'.""" + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(A, 'A') + validate_argument_dtype(x, 'x') + if not isinstance(beta, Number): raise ValueError('beta is not a numeric type') + if A.ndim != 2: raise ValueError('A is not a two-dimensional array') + if x.ndim != 1: raise ValueError('x is not a one-dimensional array') + if not _blas: _blas = Blas() + _blas.stream = stream + m, n = A.shape + trans = _normalize_op(trans) + if trans not in ('N', 'T', 'C'): raise ValueError('trans has invalid value') + dtype = promote(promote(type(alpha), A.dtype), + promote(x.dtype, type(beta))) + if y is None: + y = np.empty(trans == 'N' and n or m, dtype=dtype) + yf = y + else: + validate_argument_dtype(y, 'y') + if y.ndim != 1: raise ValueError('y is not a one-dimensional array') + dtype = promote(dtype, y.dtype) + yf = colmajor(y, dtype, 'y') + if trans == 'N': + if A.shape[1] != x.shape[0]: + raise ValueError('arrays A and x have incompatible shapes') + if A.shape[0] != y.shape[0]: + raise ValueError('arrays A and y have incompatible shapes') + else: + if A.shape[0] != x.shape[0]: + raise ValueError('arrays A and x have incompatible shapes') + if A.shape[1] != y.shape[0]: + raise ValueError('arrays A and y have incompatible shapes') + _blas.gemv(trans, m, n, dtype.type(alpha), colmajor(A, dtype, 'A'), + x.astype(dtype), dtype.type(beta), yf) + if y.dtype == yf.dtype and not alias(y, yf): + y[:] = yf + return y + else: + return yf + +def gemm(transa, transb, alpha, A, B, beta=0, C=None, stream=None): + """Generalized matrix-matrix multiplication: + + C <- alpha*transa(A)*transb(B) + beta*C + + 'beta' and 'C' are optional on input. 
Return 'C'.""" + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(A, 'A') + validate_argument_dtype(B, 'B') + if not isinstance(beta, Number): raise ValueError('beta is not a numeric type') + if A.ndim != 2: raise ValueError('A is not a two-dimensional array') + if B.ndim != 2: raise ValueError('B is not a two-dimensional array') + if not _blas: _blas = Blas() + _blas.stream = stream + transa = _normalize_op(transa) + if transa not in ('N', 'T', 'C'): raise ValueError('transa has invalid value') + transb = _normalize_op(transb) + if transb not in ('N', 'T', 'C'): raise ValueError('transb has invalid value') + dtype = promote(promote(type(alpha), A.dtype), + promote(B.dtype, type(beta))) + M = transa == 'N' and A.shape[0] or A.shape[1] + N = transb == 'N' and B.shape[1] or B.shape[0] + K = transa == 'N' and A.shape[1] or A.shape[0] + if C is None: + C = np.empty(shape=(M, N), order='F', dtype=dtype) + Cf = C + else: + validate_argument_dtype(C, 'C') + if C.ndim != 2: raise ValueError('C is not a two-dimensional array') + if C.shape[0] != M: + raise ValueError('arrays A and C have incompatible shapes') + if C.shape[1] != N: + raise ValueError('arrays B and C have incompatible shapes') + dtype = promote(dtype, C.dtype) + Cf = colmajor(C, dtype, 'C') + + if transb == 'N': + if B.shape[0] != K: raise ValueError('arrays A and B have incompatible shapes') + else: + if B.shape[1] != K: raise ValueError('arrays A and B have incompatible shapes') + + _blas.gemm(transa, transb, M, N, K, dtype.type(alpha), + colmajor(A, dtype, 'A'), colmajor(B, dtype, 'B'), + dtype.type(beta), Cf) + if C.dtype == Cf.dtype and not alias(C, Cf): + C[:] = Cf + return C + else: + return Cf diff --git a/pyculib/blas/api.py b/pyculib/blas/api.py new file mode 100644 index 0000000..5b4af52 --- /dev/null +++ b/pyculib/blas/api.py @@ -0,0 +1,501 @@ +from __future__ import print_function, absolute_import, division +from contextlib import contextmanager +import numpy as np +from .binding import cuBlas +from numba import cuda + +def validate_argument_dtype(var, name): + + # Type map + if var.dtype.name not in ('float32', 'float64', 'complex64', 'complex128'): + raise TypeError('%s has unsupported type %s'%(name, var.dtype.name)) + +def _dtype_vtable(table): + return dict((np.dtype(k), v) for k, v in table.items()) + +def _sel_complex(real, imag): + return {float: real, + complex: imag, + np.float32: real, + np.float64: real, + np.complex64: imag, + np.complex128: imag,} + +def _auto_l2_functions(fname, tnames, argfmt, extras): + writebacks = set() + readonlys = set() + arglist = [] + extras = [s.lstrip().rstrip() for s in extras.split(',')] + dtypemap = { + np.dtype(np.float32): 'S', + np.dtype(np.float64): 'D', + np.dtype(np.complex64): 'C', + np.dtype(np.complex128): 'Z', + } + for i, a in enumerate(argfmt.split(',')): + a = a.lstrip().rstrip() + if ':' in a: + name, mode = a.split(':') + assert mode in 'wr', "invalid mode" + if mode == 'w': + writebacks.add(name) + else: + readonlys.add(name) + else: + name = a + arglist.append(name) + + def prepare_args(args, kws): + for i, a in enumerate(args): + name = arglist[i] + assert name not in kws, "missing argument %s" % name + kws[name] = a + for a in extras: + if a.startswith('ld') and len(a) == 3: + kws[a] = kws[a[-1].upper()].shape[0] + elif a.startswith('inc') and len(a) == 4: + ary = kws[a[-1]] + kws[a] = ary.strides[0] // ary.dtype.itemsize + else: + assert False, 'unreachable' + + devargs = 
list(writebacks | readonlys) + + def autodevice(kws, stream): + newkws = kws.copy() + cleanups = [] + for a in readonlys: + newkws[a], _ = cuda._auto_device(kws[a], stream=stream) + for a in writebacks: + dmem, conv = cuda._auto_device(kws[a], stream=stream) + newkws[a] = dmem + if conv: + cleanups.append((dmem, kws[a])) + return newkws, cleanups + + def _dispatch(self, *args, **kws): + prepare_args(args, kws) + dtype = kws[devargs[0]].dtype + for i, darg in enumerate(devargs[1:]): + got = kws[darg].dtype + if got != dtype: + msg = "%dth array dtype mismatch: got %s but expect %s" + raise TypeError(msg % (i + 1, got, dtype)) + typecode = dtypemap[dtype] + assert typecode in tnames, "unknown typecode" + fn = getattr(self._cublas, '%s%s' % (typecode, fname)) + kws, cleanups = autodevice(kws, self.stream) + res = fn(**kws) + for dmem, ary in cleanups: + dmem.copy_to_host(ary, stream=self.stream) + return res + + # changes how user see this function through help() + _dispatch.__name__ = fname + _dispatch.__doc__ = "%s(%s)" % (fname, argfmt) + return _dispatch + +class Blas(object): + '''All BLAS subprograms are available under the Blas object. + + :param stream: Optional. A CUDA Stream. + ''' + @cuda.require_context + def __init__(self, stream=0): + self._cublas = cuBlas() + if stream: + self._cublas.stream = stream + + @property + def stream(self): + return self._cublas.stream + + @stream.setter + def stream(self, stream): + self._cublas.stream = stream + + @contextmanager + def _auto(self, *arys): + ctx = (cuda._auto_device(ary, stream=self.stream) for ary in arys) + darys, convs = zip(*ctx) + if len(darys) == 1: + yield darys[0] + else: + yield darys + for dary, conv, ary in zip(darys, convs, arys): + if conv: + dary.copy_to_host(ary, stream=self.stream) + + @contextmanager + def _auto_read(self, *arys): + ctx = (cuda._auto_device(ary, stream=self.stream) for ary in arys) + darys, convs = zip(*ctx) + if len(darys) == 1: + yield darys[0] + else: + yield darys + + def _dispatch(self, vtable, *keys): + rsvl = vtable + for k in keys: + if not isinstance(rsvl, dict): + break + + try: + rsvl = rsvl[k] + except KeyError: + raise TypeError(k) + + return getattr(self._cublas, rsvl) + + def nrm2(self, x): + "Same as np.linalg.norm" + _sentry_same_dtype(x) + _sentry_ndim(1, x) + fn = self._dispatch(self.nrm2.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, *_norm_stride(x)) + + nrm2.vtable = _dtype_vtable({np.float32: 'Snrm2', + np.float64: 'Dnrm2', + np.complex64: 'Scnrm2', + np.complex128: 'Dznrm2'}) + + def dot(self, x, y): + """Compute and return the vector dot product of x and y.""" + _sentry_same_dtype(x, y) + _sentry_ndim(1, x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.dot.vtable, x.dtype) + with self._auto_read(x, y) as (dx, dy): + result = fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0]) + result = x.dtype.type(result) + return result + + dot.vtable = _dtype_vtable({np.float32: 'Sdot', + np.float64: 'Ddot', + np.complex64: 'Cdotu', + np.complex128: 'Zdotu',}) + + def dotc(self, x, y): + "Same as np.vdot" + _sentry_same_dtype(x, y) + _sentry_ndim(1, x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.dotc.vtable, x.dtype) + with self._auto_read(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0]) + + dotc.vtable = _dtype_vtable({np.complex64: 'Cdotc', + np.complex128: 'Zdotc',}) + + def scal(self, alpha, x): + "Same as x = alpha * x" + _sentry_ndim(1, x) + fn = self._dispatch(self.scal.vtable, x.dtype, 
type(alpha)) + with self._auto(x) as dx: + return fn(x.size, alpha, dx, *_norm_stride(x)) + + scal.vtable = _dtype_vtable({np.float32: 'Sscal', + np.float64: 'Dscal', + np.complex64: _sel_complex(imag='Cscal', + real='Csscal'), + np.complex128: _sel_complex(imag='Zscal', + real='Zdscal')}) + def axpy(self, alpha, x, y): + "Same as y = alpha * x + y" + _sentry_ndim(1, x, y) + _sentry_same_dtype(x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.axpy.vtable, x.dtype) + with self._auto_read(x) as dx: + with self._auto(y) as dy: + return fn(x.size, alpha, dx, _norm_stride(x)[0], dy, + _norm_stride(y)[0]) + + axpy.vtable = _dtype_vtable({np.float32: 'Saxpy', + np.float64: 'Daxpy', + np.complex64: 'Caxpy', + np.complex128: 'Zaxpy'}) + + def amax(self, x): + "Same as np.argmax(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.amax.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) - 1 + + amax.vtable = _dtype_vtable({np.float32: 'Isamax', + np.float64: 'Idamax', + np.complex64: 'Icamax', + np.complex128: 'Izamax'}) + + def amin(self, x): + "Same as np.argmin(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.amin.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) - 1 + + amin.vtable = _dtype_vtable({np.float32: 'Isamin', + np.float64: 'Idamin', + np.complex64: 'Icamin', + np.complex128: 'Izamin'}) + + def asum(self, x): + "Same as np.sum(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.asum.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) + + asum.vtable = _dtype_vtable({np.float32: 'Sasum', + np.float64: 'Dasum', + np.complex64: 'Scasum', + np.complex128: 'Dzasum'}) + + def rot(self, x, y, c, s): + "Same as x, y = c * x + s * y, -s * x + c * y" + _sentry_ndim(1, x, y) + fn = self._dispatch(self.rot.vtable, x.dtype, type(s)) + with self._auto(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(x)[0], + c, s) + + rot.vtable = _dtype_vtable({np.float32: 'Srot', + np.float64: 'Drot', + np.complex64: _sel_complex(imag='Crot', + real='Csrot'), + np.complex128: _sel_complex(imag='Zrot', + real='Zdrot')}) + + def rotg(self, a, b): + '''Compute the given rotation matrix given a column vector (a, b). + Returns r, z, c, s. + + r: r = a ** 2 + b ** 2. + + z: Use to recover c and s. + + if abs(z) < 1: + c, s = 1 - z ** 2, z + elif abs(z) == 1: + c, s = 0, 1 + else: + c, s = 1 / z, 1 - z ** 2 + + c: Cosine element of the rotation matrix. + + s: Sine element of the rotation matrix. + ''' + a, b = np.asarray(a), np.asarray(b) + _sentry_same_dtype(a, b) + fn = self._dispatch(self.rotg.vtable, a.dtype) + return fn(np.asscalar(a), np.asscalar(b)) + + rotg.vtable = _dtype_vtable({np.float32: 'Srotg', + np.float64: 'Drotg', + np.complex64: 'Crotg', + np.complex128: 'Zrotg'}) + + def rotm(self, x, y, param): + '''Applies the modified Givens transformation. + + x, y = h11 * x + h12 * y, h21 * x + h22 * y + + param --- [flag, h11, h21, h12, h22] + + Refer to cuBLAS documentation for detail. + ''' + _sentry_ndim(1, x, y) + _sentry_same_dtype(x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.rotm.vtable, x.dtype) + with self._auto(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0], + param) + + rotm.vtable = _dtype_vtable({np.float32: 'Srotm', + np.float64: 'Drotm'}) + + def rotmg(self, d1, d2, x1, y1): + '''Constructs the modified Givens transformation. + + Returns param that is usable in rotm. 
+ + Refer to cuBLAS documentation for detail. + ''' + d1, d2, x1, y1 = map(np.asarray, [d1, d2, x1, y1]) + _sentry_same_dtype(d1, d2, x1, y1) + fn = self._dispatch(self.rotmg.vtable, x1.dtype) + return fn(*map(np.asscalar, [d1, d2, x1, y1])) + + rotmg.vtable = _dtype_vtable({np.float32: 'Srotmg', + np.float64: 'Drotmg'}) + + # Level 2 + + gbmv = _auto_l2_functions('gbmv', 'SDCZ', + 'trans, m, n, kl, ku, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + gemv = _auto_l2_functions('gemv', 'SDCZ', + 'trans, m, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + trmv = _auto_l2_functions('trmv', 'SDCZ', + 'uplo, trans, diag, n, A:r, x:w', + 'lda, incx') + + tbmv = _auto_l2_functions('tbmv', 'SDCZ', + 'uplo, trans, diag, n, k, A:r, x:w', + 'lda, incx') + + tpmv = _auto_l2_functions('tpmv', 'SDCZ', + 'uplo, trans, diag, n, AP:r, x:w', + 'incx') + + trsv = _auto_l2_functions('trsv', 'SDCZ', + 'uplo, trans, diag, n, A:r, x:w', + 'lda, incx') + + tpsv = _auto_l2_functions('tpsv', 'SDCZ', + 'uplo, trans, diag, n, AP:r, x:w', + 'incx') + + tbsv = _auto_l2_functions('tbsv', 'SDCZ', + 'uplo, trans, diag, n, k, A:r, x:w', + 'lda, incx') + + symv = _auto_l2_functions('symv', 'SDCZ', + 'uplo, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + hemv = _auto_l2_functions('hemv', 'CZ', + 'uplo, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + sbmv = _auto_l2_functions('sbmv', 'SDCZ', + 'uplo, n, k, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + hbmv = _auto_l2_functions('hbmv', 'CZ', + 'uplo, n, k, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + spmv = _auto_l2_functions('spmv', 'SD', + 'uplo, n, alpha, AP:r, x:r, beta, y:w', + 'incx, incy') + + hpmv = _auto_l2_functions('hpmv', 'CZ', + 'uplo, n, alpha, AP:r, x:r, beta, y:w', + 'incx, incy') + + ger = _auto_l2_functions('ger', 'SD', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + geru = _auto_l2_functions('geru', 'CZ', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + gerc = _auto_l2_functions('gerc', 'CZ', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + syr = _auto_l2_functions('syr', 'SDCZ', 'uplo, n, alpha, x:r, A:w', + 'incx, lda') + + her = _auto_l2_functions('her', 'CZ', 'uplo, n, alpha, x:r, A:w', + 'incx, lda') + + spr = _auto_l2_functions('spr', 'SD', 'uplo, n, alpha, x:r, AP:w', + 'incx') + + hpr = _auto_l2_functions('hpr', 'CZ', 'uplo, n, alpha, x:r, AP:w', + 'incx') + + syr2 = _auto_l2_functions('syr2', 'SDCZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + her2 = _auto_l2_functions('her2', 'CZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + spr2 = _auto_l2_functions('spr2', 'SDCZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy') + + hpr2 = _auto_l2_functions('hpr2', 'CZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy') + + # Level 3 + + gemm = _auto_l2_functions('gemm', 'SDCZ', + 'transa, transb, m, n, k, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + syrk = _auto_l2_functions('syrk', 'SDCZ', + 'uplo, trans, n, k, alpha, A:r, beta, C:w', + 'lda, ldc') + + herk = _auto_l2_functions('herk', 'CZ', + 'uplo, trans, n, k, alpha, A:r, beta, C:w', + 'lda, ldc') + + symm = _auto_l2_functions('symm', 'SDCZ', + 'side, uplo, m, n, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + hemm = _auto_l2_functions('hemm', 'CZ', + 'side, uplo, m, n, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + trsm = _auto_l2_functions('trsm', 'SDCZ', + 'side, uplo, trans, diag, m, n, alpha, A:r, B:w', + 'lda, ldb') + + trmm = _auto_l2_functions('trmm', 'SDCZ', + 'side, uplo, 
trans, diag, m, n, alpha, A:r, B:r, C:w', + 'lda, ldb, ldc') + + dgmm = _auto_l2_functions('dgmm', 'SDCZ', + 'side, m, n, A:r, x:r, C:w', + 'lda, ldc, incx') + + geam = _auto_l2_functions('geam', 'SDCZ', + 'transa, transb, m, n, alpha, A:r, beta, B:r, C:w', + 'lda, ldb, ldc') + +#---------------- +# utils +#---------------- + + + +def _sentry_same_shape(*arys): + first = arys[0] + for ary in arys: + if ary.shape != first.shape: + raise ValueError("Expecting all arrays to have the same shape.") + +def _sentry_same_dtype(*arys): + first = arys[0] + for ary in arys: + if ary.dtype != first.dtype: + raise TypeError("All arrays must have the same dtype.") + +def _sentry_ndim(ndim, *arys): + for ary in arys: + if ary.ndim != ndim: + raise ValueError("Expecting %d dimension array." % ndim) + +def _norm_stride(ary): + retval = [] + for stride in ary.strides: + if stride % ary.dtype.itemsize != 0: + raise ValueError("Misalignment.") + retval.append(stride // ary.dtype.itemsize) + return retval diff --git a/pyculib/blas/binding.py b/pyculib/blas/binding.py new file mode 100644 index 0000000..adc6d90 --- /dev/null +++ b/pyculib/blas/binding.py @@ -0,0 +1,1476 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import c_float, c_double, byref, c_int, c_void_p, POINTER + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer, host_pointer +from pyculib.utils import (Lib, ctype_function, OwnerMixin, + c_double_complex, c_complex, memalign) + +INV_STATUS = dict( + CUBLAS_STATUS_SUCCESS=0, + CUBLAS_STATUS_NOT_INITIALIZED=1, + CUBLAS_STATUS_ALLOC_FAILED=3, + CUBLAS_STATUS_INVALID_VALUE=7, + CUBLAS_STATUS_ARCH_MISMATCH=8, + CUBLAS_STATUS_MAPPING_ERROR=11, + CUBLAS_STATUS_EXECUTION_FAILED=13, + CUBLAS_STATUS_INTERNAL_ERROR=14 +) + +STATUS = dict((v, k) for k, v in INV_STATUS.items()) + +cublasStatus_t = c_int + +CUBLAS_FILL_MODE_LOWER = 0 +CUBLAS_FILL_MODE_UPPER = 1 + +CUBLAS_FILL_MODE_MAP = { + 'L': CUBLAS_FILL_MODE_LOWER, + 'U': CUBLAS_FILL_MODE_UPPER, +} + +cublasFillMode_t = c_int + +CUBLAS_DIAG_NON_UNIT = 0 +CUBLAS_DIAG_UNIT = 1 + +cublasDiagType_t = c_int + +CUBLAS_DIAG_MAP = { + True: CUBLAS_DIAG_UNIT, + False: CUBLAS_DIAG_NON_UNIT, +} + +CUBLAS_SIDE_LEFT = 0 +CUBLAS_SIDE_RIGHT = 1 + +CUBLAS_SIDE_MAP = { + 'L': CUBLAS_SIDE_LEFT, + 'R': CUBLAS_SIDE_RIGHT, +} + +cublasSideMode_t = c_int + +CUBLAS_OP_N = 0 +CUBLAS_OP_T = 1 +CUBLAS_OP_C = 2 + +cublasOperation_t = c_int + +CUBLAS_POINTER_MODE_HOST = 0 +CUBLAS_POINTER_MODE_DEVICE = 1 + +cublasPointerMode_t = c_int + +CUBLAS_ATOMICS_NOT_ALLOWED = 0 +CUBLAS_ATOMICS_ALLOWED = 1 + +cublasAtomicsMode_t = c_int + +cublasHandle_t = c_void_p # opaque handle + +CUBLAS_OP_MAP = {'N': CUBLAS_OP_N, + 'T': CUBLAS_OP_T, + 'C': CUBLAS_OP_C, + 'H': CUBLAS_OP_C, } + + +class CuBLASError(Exception): + def __init__(self, code): + super(CuBLASError, self).__init__(STATUS[code]) + + +class libcublas(Lib): + lib = 'cublas' + ErrorType = CuBLASError + + cublasCreate_v2 = ctype_function(cublasStatus_t, + POINTER(cublasHandle_t)) # handle + + cublasDestroy_v2 = ctype_function(cublasStatus_t, + cublasHandle_t) # handle + + cublasGetVersion_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(c_int)) # version + + cublasSetStream_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cu_stream) # streamId + + cublasGetStream_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(cu_stream)) # streamId + + cublasGetPointerMode_v2 = ctype_function(cublasStatus_t, + 
cublasHandle_t, # handle + POINTER( + cublasPointerMode_t)) # mode + + cublasSetPointerMode_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cublasPointerMode_t) # mode + + cublasGetAtomicsMode = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(cublasAtomicsMode_t)) # mode + + cublasSetAtomicsMode = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cublasAtomicsMode_t) # mode + + # Level 1 + cublasSnrm2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # device array + c_int, # incx + c_void_p) # result - host/device scalar + + cublasDnrm2_v2 = cublasSnrm2_v2 + cublasScnrm2_v2 = cublasSnrm2_v2 + cublasDznrm2_v2 = cublasSnrm2_v2 + + cublasSdot_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy, + c_void_p) # result h/d ptr + cublasDdot_v2 = cublasSdot_v2 + cublasCdotu_v2 = cublasSdot_v2 + cublasZdotu_v2 = cublasSdot_v2 + cublasCdotc_v2 = cublasSdot_v2 + cublasZdotc_v2 = cublasSdot_v2 + + cublasSscal_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # alpha h/d + c_void_p, # x + c_int) # incx + + cublasDscal_v2 = cublasSscal_v2 + cublasCscal_v2 = cublasSscal_v2 + cublasZscal_v2 = cublasSscal_v2 + cublasCsscal_v2 = cublasSscal_v2 + cublasZdscal_v2 = cublasSscal_v2 + + cublasSaxpy_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # alpha h/d + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int) # incy + cublasDaxpy_v2 = cublasSaxpy_v2 + cublasCaxpy_v2 = cublasSaxpy_v2 + cublasZaxpy_v2 = cublasSaxpy_v2 + + cublasIsamax_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + POINTER(c_int)) # result h/d ptr + + cublasIdamax_v2 = cublasIsamax_v2 + cublasIcamax_v2 = cublasIsamax_v2 + cublasIzamax_v2 = cublasIsamax_v2 + + cublasIsamin_v2 = cublasIsamax_v2 + + cublasIdamin_v2 = cublasIsamin_v2 + cublasIcamin_v2 = cublasIsamin_v2 + cublasIzamin_v2 = cublasIsamin_v2 + + cublasSasum_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p) # result h/d ptr + + cublasDasum_v2 = cublasSasum_v2 + cublasScasum_v2 = cublasSasum_v2 + cublasDzasum_v2 = cublasSasum_v2 + + cublasSrot_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy + c_void_p, # c + c_void_p) # s h/d ptr + + cublasDrot_v2 = cublasSrot_v2 + cublasCrot_v2 = cublasSrot_v2 + cublasZrot_v2 = cublasSrot_v2 + cublasCsrot_v2 = cublasSrot_v2 + cublasZdrot_v2 = cublasSrot_v2 + + cublasSrotg_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_void_p, # a h/d ptr + c_void_p, # b h/d ptr + c_void_p, # c h/d ptr + c_void_p) # s h/d ptr + + cublasDrotg_v2 = cublasSrotg_v2 + cublasCrotg_v2 = cublasSrotg_v2 + cublasZrotg_v2 = cublasSrotg_v2 + + cublasSrotm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy + c_void_p) # param h/d pointer + cublasDrotm_v2 = cublasSrotm_v2 + + cublasSrotmg_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_void_p, # d1 h/d ptr + c_void_p, # d2 h/d ptr + c_void_p, # x1 h/d ptr + c_void_p, # y1 h/d ptr + c_void_p) # param h/d ptr + + cublasDrotmg_v2 = cublasSrotmg_v2 + + # + # Level 2 + # + cublasSgbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # 
handle, + cublasOperation_t, # trans, + c_int, # m, + c_int, # n, + c_int, # kl, + c_int, # ku, + c_void_p, # *alpha, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int, # incx, + c_void_p, # *beta, + c_void_p, # *y, + c_int) # incy) + + cublasDgbmv_v2 = cublasSgbmv_v2 + cublasCgbmv_v2 = cublasSgbmv_v2 + cublasZgbmv_v2 = cublasSgbmv_v2 + + cublasSgemv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasOperation_t, # trans, + c_int, # m, + c_int, # n, + c_void_p, # *alpha, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int, # incx, + c_void_p, # *beta, + c_void_p, # *y, + c_int) # incy) + + cublasDgemv_v2 = cublasSgemv_v2 + cublasCgemv_v2 = cublasSgemv_v2 + cublasZgemv_v2 = cublasSgemv_v2 + + cublasStrmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtrmv_v2 = cublasStrmv_v2 + cublasCtrmv_v2 = cublasStrmv_v2 + cublasZtrmv_v2 = cublasStrmv_v2 + + cublasStbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_int, # k, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtbmv_v2 = cublasStbmv_v2 + cublasCtbmv_v2 = cublasStbmv_v2 + cublasZtbmv_v2 = cublasStbmv_v2 + + cublasStpmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *AP, + c_void_p, # *x, + c_int) # incx); + + cublasDtpmv_v2 = cublasStpmv_v2 + cublasCtpmv_v2 = cublasStpmv_v2 + cublasZtpmv_v2 = cublasStpmv_v2 + + cublasStrsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtrsv_v2 = cublasStrsv_v2 + cublasCtrsv_v2 = cublasStrsv_v2 + cublasZtrsv_v2 = cublasStrsv_v2 + + cublasStpsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #n, + c_void_p, #*AP, + c_void_p, #*x, + c_int) #incx); + + cublasDtpsv_v2 = cublasStpsv_v2 + cublasCtpsv_v2 = cublasStpsv_v2 + cublasZtpsv_v2 = cublasStpsv_v2 + + cublasStbsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #n, + c_int, #k, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int) #incx); + + cublasDtbsv_v2 = cublasStbsv_v2 + cublasCtbsv_v2 = cublasStbsv_v2 + cublasZtbsv_v2 = cublasStbsv_v2 + + cublasSsymv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + + cublasDsymv_v2 = cublasSsymv_v2 + cublasCsymv_v2 = cublasSsymv_v2 + cublasZsymv_v2 = cublasSsymv_v2 + + cublasChemv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + cublasZhemv_v2 = cublasChemv_v2 + + cublasSsbmv_v2 = ctype_function(cublasStatus_t, + 
cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_int, #k, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta + c_void_p, #*y, + c_int) #incy); + cublasDsbmv_v2 = cublasSsbmv_v2 + + cublasChbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + cublasZhbmv_v2 = cublasChbmv_v2 + + cublasSspmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*AP, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + + cublasDspmv_v2 = cublasSspmv_v2 + cublasChpmv_v2 = cublasSspmv_v2 + cublasZhpmv_v2 = cublasChpmv_v2 + + cublasSger_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + c_int, #m, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p, #*A, + c_int) #lda); + cublasDger_v2 = cublasSger_v2 + cublasCgeru_v2 = cublasDger_v2 + cublasCgerc_v2 = cublasDger_v2 + cublasZgeru_v2 = cublasDger_v2 + cublasZgerc_v2 = cublasDger_v2 + + cublasSsyr_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*A, + c_int) #lda); + cublasDsyr_v2 = cublasSsyr_v2 + cublasCsyr_v2 = cublasSsyr_v2 + cublasZsyr_v2 = cublasSsyr_v2 + + cublasCher_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*A, + c_int) #lda); + + cublasZher_v2 = cublasCher_v2 + + cublasSspr_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, # uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p) #*AP); + + cublasDspr_v2 = cublasSspr_v2 + cublasChpr_v2 = cublasSspr_v2 + cublasZhpr_v2 = cublasSspr_v2 + + cublasSsyr2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p, #*A, + c_int) #lda); + + cublasDsyr2_v2 = cublasSsyr2_v2 + cublasCsyr2_v2 = cublasSsyr2_v2 + cublasZsyr2_v2 = cublasSsyr2_v2 + cublasCher2_v2 = cublasSsyr2_v2 + cublasZher2_v2 = cublasSsyr2_v2 + + cublasSspr2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p) #*AP); + + cublasDspr2_v2 = cublasSspr2_v2 + + cublasChpr2_v2 = cublasSspr2_v2 + cublasZhpr2_v2 = cublasSspr2_v2 + + # Level 3 + cublasSgemm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasOperation_t, #transa, + cublasOperation_t, #transb, + c_int, #m, + c_int, #n, + c_int, #k, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta, + c_void_p, #*C, + c_int) #ldc); + + cublasDgemm_v2 = cublasSgemm_v2 + cublasCgemm_v2 = cublasSgemm_v2 + cublasZgemm_v2 = cublasSgemm_v2 + + cublasSsyrk_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta, + c_void_p, #*C, + c_int) 
#ldc); + + cublasDsyrk_v2 = cublasSsyrk_v2 + cublasCsyrk_v2 = cublasSsyrk_v2 + cublasZsyrk_v2 = cublasSsyrk_v2 + + cublasCherk_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + cublasZherk_v2 = cublasCherk_v2 + + cublasSsymm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + + cublasDsymm_v2 = cublasSsymm_v2 + cublasCsymm_v2 = cublasSsymm_v2 + cublasZsymm_v2 = cublasSsymm_v2 + + cublasChemm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + cublasZhemm_v2 = cublasChemm_v2 + + cublasStrsm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int) #ldb); + + cublasDtrsm_v2 = cublasStrsm_v2 + cublasCtrsm_v2 = cublasStrsm_v2 + cublasZtrsm_v2 = cublasStrsm_v2 + + cublasStrmm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*C, + c_int) #ldc); + + cublasDtrmm_v2 = cublasStrmm_v2 + cublasCtrmm_v2 = cublasStrmm_v2 + cublasZtrmm_v2 = cublasStrmm_v2 + + cublasSdgmm = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #mode, + c_int, #m, + c_int, #n, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*C, + c_int) #ldc); + cublasDdgmm = cublasSdgmm + cublasCdgmm = cublasSdgmm + cublasZdgmm = cublasSdgmm + + cublasSgeam = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasOperation_t, #transa, + cublasOperation_t, #transb, + c_int, #m, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*C + c_int) #ldc); + cublasDgeam = cublasSgeam + cublasCgeam = cublasSgeam + cublasZgeam = cublasSgeam + + +class _alloc_scalar(object): + def __init__(self, cty, value=None): + if cty == c_double_complex: + cval, data = memalign(cty, align=16) + self._data = data + self._cval = cval + self._as_parameter_ = byref(self._cval) + self.return_value = lambda: self._cval.value + + if value is not None: + val = cty(value) + self._cval.real = val.real + self._cval.imag = val.imag + + else: + if value is not None: + self._data = cty(value) + else: + self._data = cty() + self._as_parameter_ = byref(self._data) + self.return_value = lambda: self._data.value + +def _return_scalar(result): + return result.return_value() + + +def _Tnrm2(fmt, cty): + def nrm2(self, n, x, incx): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%snrm2_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return nrm2 + + +def 
_Tdot(fmt, cty, postfix=''): + def dot(self, n, x, incx, y, incy): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%sdot%s_v2' % (fmt, postfix)) + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), result) + return _return_scalar(result) + + return dot + + +def _Tscal(fmt, cty): + def scal(self, n, alpha, x, incx): + "Stores result to x" + c_alpha = _alloc_scalar(cty, alpha) + fn = getattr(self._api, 'cublas%sscal_v2' % fmt) + fn(self._handle, int(n), c_alpha, device_pointer(x), int(incx)) + + return scal + + +def _Taxpy(fmt, cty): + def axpy(self, n, alpha, x, incx, y, incy): + "Stores result to y" + c_alpha = _alloc_scalar(cty, alpha) + fn = getattr(self._api, 'cublas%saxpy_v2' % fmt) + fn(self._handle, int(n), c_alpha, device_pointer(x), int(incx), + device_pointer(y), int(incy)) + + return axpy + + +def _Itamax(fmt, cty): + def amax(self, n, x, incx): + result = _alloc_scalar(c_int) + fn = getattr(self._api, 'cublasI%samax_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), + result) + return _return_scalar(result) + + return amax + + +def _Itamin(fmt, cty): + def amin(self, n, x, incx): + result = _alloc_scalar(c_int) + fn = getattr(self._api, 'cublasI%samin_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return amin + + +def _Tasum(fmt, cty): + def asum(self, n, x, incx): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%sasum_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return asum + + +def _Trot(fmt, cty, sty): + def rot(self, n, x, incx, y, incy, c, s): + "Stores to x and y" + c_c = _alloc_scalar(cty, c) + c_s = _alloc_scalar(sty, s) + fn = getattr(self._api, 'cublas%srot_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), c_c, c_s) + + return rot + + +def _Trotg(fmt, ty, cty): + def rotg(self, a, b): + c_a = _alloc_scalar(ty, a) + c_b = _alloc_scalar(ty, b) + c_c = _alloc_scalar(cty) + c_s = _alloc_scalar(ty) + fn = getattr(self._api, 'cublas%srotg_v2' % fmt) + fn(self._handle, c_a, c_b, c_c, c_s) + r, z, c, s = map(_return_scalar, [c_a, c_b, c_c, c_s]) + return r, z, c, s + + return rotg + + +def _Trotm(fmt, dtype): + def rotm(self, n, x, incx, y, incy, param): + "Stores result to x, y" + fn = getattr(self._api, 'cublas%srotm_v2' % fmt) + assert len(param.shape) == 1, "param must be a 1-d array" + assert param.size >= 5, "param must have at least 5 elements" + assert param.dtype == np.dtype(dtype), "param dtype mismatch" + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), host_pointer(param)) + + return rotm + + +def _Trotmg(fmt, cty, dtype): + def rotmg(self, d1, d2, x1, y1): + fn = getattr(self._api, 'cublas%srotmg_v2' % fmt) + c_d1 = _alloc_scalar(cty, d1) + c_d2 = _alloc_scalar(cty, d2) + c_x1 = _alloc_scalar(cty, x1) + c_y1 = _alloc_scalar(cty, y1) + param = np.zeros(5, dtype=dtype) + fn(self._handle, c_d1, c_d2, c_x1, c_y1, host_pointer(param)) + return param + + return rotmg + + +def _Tgbmv(fmt, cty, dtype): + def gbmv(self, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy): + '''This function performs the banded matrix-vector multiplication + ''' + fn = getattr(self._api, 'cublas%sgbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + trans = CUBLAS_OP_MAP[trans] + fn(self._handle, trans, m, n, kl, ku, c_alpha, device_pointer(A), + 
lda, device_pointer(x), incx, c_beta, device_pointer(y), incy) + + return gbmv + + +def _Tgemv(fmt, cty, dtype): + def gemv(self, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): + '''This function performs the banded matrix-vector multiplication + ''' + fn = getattr(self._api, 'cublas%sgemv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + trans = CUBLAS_OP_MAP[trans] + fn(self._handle, trans, m, n, c_alpha, device_pointer(A), + lda, device_pointer(x), incx, c_beta, device_pointer(y), incy) + + return gemv + + +def _Ttrmv(fmt, dtype): + def trmv(self, uplo, trans, diag, n, A, lda, x, incx): + fn = getattr(self._api, 'cublas%strmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(A), lda, device_pointer(x), + incx) + + return trmv + + +def _Ttbmv(fmt, dtype): + def tbmv(self, uplo, trans, diag, n, k, A, lda, x, incx): + fn = getattr(self._api, 'cublas%stbmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, k, device_pointer(A), lda, + device_pointer(x), incx) + + return tbmv + + +def _Ttpmv(fmt, dtype): + def tpmv(self, uplo, trans, diag, n, AP, x, incx): + fn = getattr(self._api, 'cublas%stpmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(AP), device_pointer(x), + incx) + + return tpmv + + +def _Ttrsv(fmt, dtype): + def trsv(self, uplo, trans, diag, n, A, lda, x, incx): + fn = getattr(self._api, 'cublas%strsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(A), lda, device_pointer(x), + incx) + + return trsv + + +def _Ttpsv(fmt, dtype): + def tpsv(self, uplo, trans, diag, n, AP, x, incx): + fn = getattr(self._api, 'cublas%stpsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(AP), device_pointer(x), + incx) + + return tpsv + + +def _Ttbsv(fmt, dtype): + def tbsv(self, uplo, trans, diag, n, k, A, lda, x, incx): + fn = getattr(self._api, 'cublas%stbsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, k, device_pointer(A), lda, + device_pointer(x), incx) + + return tbsv + + +def _Tsymv(fmt, cty, dtype): + def symv(self, uplo, n, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%ssymv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return symv + + +def _Themv(fmt, cty, dtype): + def symv(self, uplo, n, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shemv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return symv + + +def _Tsbmv(fmt, cty, dtype): + def sbmv(self, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%ssbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, k, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Thbmv(fmt, cty, dtype): 
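+    # Same wrapper shape as _Tsbmv above; only the cuBLAS entry point
+    # (cublasXhbmv_v2) differs, so the inner function name is reused.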
+ def sbmv(self, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, k, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Tspmv(fmt, cty, dtype): + def sbmv(self, uplo, n, alpha, AP, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%sspmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(AP), device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Thpmv(fmt, cty, dtype): + def sbmv(self, uplo, n, alpha, AP, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shpmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(AP), device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Tger(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sger_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, + device_pointer(A), lda) + + return ger + + +def _Tgeru(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sgeru_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, + device_pointer(A), lda) + + return ger + + +def _Tgerc(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sgerc_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, device_pointer(x), incx, + device_pointer(y), incy, device_pointer(A), lda) + + return ger + + +def _Tsyr(fmt, cty, dtype): + def syr(self, uplo, n, alpha, x, incx, A, lda): + fn = getattr(self._api, 'cublas%ssyr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(A), lda) + + return syr + + +def _Ther(fmt, cty, dtype): + def her(self, uplo, n, alpha, x, incx, A, lda): + fn = getattr(self._api, 'cublas%sher_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(A), lda) + + return her + + +def _Tspr(fmt, cty, dtype): + def spr(self, uplo, n, alpha, x, incx, AP): + fn = getattr(self._api, 'cublas%sspr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(AP)) + + return spr + + +def _Thpr(fmt, cty, dtype): + def hpr(self, uplo, n, alpha, x, incx, AP): + fn = getattr(self._api, 'cublas%shpr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(AP)) + + return hpr + + +def _Tsyr2(fmt, cty, dtype): + def syr2(self, uplo, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%ssyr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A), + lda) + + return syr2 + + +def _Ther2(fmt, 
cty, dtype): + def her2(self, uplo, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sher2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A), + lda) + + return her2 + + +def _Tspr2(fmt, cty, dtype): + def spr2(self, uplo, n, alpha, x, incx, y, incy, A): + fn = getattr(self._api, 'cublas%sspr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A)) + + return spr2 + + +def _Thpr2(fmt, cty, dtype): + def spr2(self, uplo, n, alpha, x, incx, y, incy, A): + fn = getattr(self._api, 'cublas%shpr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A)) + + return spr2 + + +def _Tgemm(fmt, cty, dtype): + def gemm(self, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc): + fn = getattr(self._api, 'cublas%sgemm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_OP_MAP[transa], CUBLAS_OP_MAP[transb], m, n, k, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return gemm + + +def _Tsyrk(fmt, cty, dtype): + def syrk(self, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): + fn = getattr(self._api, 'cublas%ssyrk_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], n, k, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(C), ldc) + + return syrk + + +def _Therk(fmt, cty, dtype): + def herk(self, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): + fn = getattr(self._api, 'cublas%sherk_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], n, k, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(C), ldc) + + return herk + + +def _Tsymm(fmt, cty, dtype): + def symm(self, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): + fn = getattr(self._api, 'cublas%ssymm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], m, + n, c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return symm + + +def _Themm(fmt, cty, dtype): + def hemm(self, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): + fn = getattr(self._api, 'cublas%shemm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], m, + n, c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return hemm + + +def _Ttrsm(fmt, cty, dtype): + def trsm(self, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb): + fn = getattr(self._api, 'cublas%strsm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], + CUBLAS_OP_MAP[trans], CUBLAS_DIAG_MAP[diag], m, n, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb) + + return trsm + + +def _Ttrmm(fmt, cty, dtype): + def trmm(self, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc): + fn = getattr(self._api, 'cublas%strmm_v2' % fmt) + 
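+        # Note: cublas<t>trmm_v2 is out-of-place and writes its result to C,
+        # unlike legacy BLAS trmm which overwrites B; hence the extra C/ldc
+        # arguments threaded through here.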
c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], + CUBLAS_OP_MAP[trans], CUBLAS_DIAG_MAP[diag], m, n, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + device_pointer(C), ldc) + + return trmm + + +def _Tdgmm(fmt, cty, dtype): + def dgmm(self, side, m, n, A, lda, x, incx, C, ldc): + fn = getattr(self._api, 'cublas%sdgmm' % fmt) + fn(self._handle, CUBLAS_SIDE_MAP[side], m, n, device_pointer(A), lda, + device_pointer(x), incx, device_pointer(C), ldc) + + return dgmm + + +def _Tgeam(fmt, cty, dtype): + def geam(self, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): + fn = getattr(self._api, 'cublas%sgeam' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_OP_MAP[transa], CUBLAS_OP_MAP[transb], m, n, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(B), ldb, device_pointer(C), ldc) + + return geam + + +class cuBlas(OwnerMixin): + def __init__(self): + self._api = libcublas() + self._handle = cublasHandle_t() + self._api.cublasCreate_v2(byref(self._handle)) + self._finalizer_track((self._handle, self._api)) + self._stream = 0 + + @classmethod + def _finalize(self, res): + handle, api = res + api.cublasDestroy_v2(handle) + + @property + def version(self): + ver = c_int() + self._api.cublasGetVersion_v2(self._handle, byref(ver)) + return ver.value + + @property + def stream(self): + return self._stream + + @stream.setter + def stream(self, stream): + self._stream = stream + self._api.cublasSetStream_v2(self._handle, self.stream and self.stream.handle or 0) + + @property + def pointer_mode(self): + mode = cublasPointerMode_t() + self._api.cublasGetPointerMode_v2(self._handle, byref(mode)) + return mode.value + + @pointer_mode.setter + def pointer_mode(self, mode): + self._api.cublasSetPointerMode_v2(self._handle, int(mode)) + + @property + def atomics_mode(self): + mode = cublasAtomicsMode_t() + self._api.cublasGetAtomicsMode(self._handle, byref(mode)) + return mode.value + + @atomics_mode.setter + def atomics_mode(self, mode): + self._api.cublasSetAtomicsMode(self._handle, int(mode)) + + # Level 1 + + Snrm2 = _Tnrm2('S', c_float) + Dnrm2 = _Tnrm2('D', c_double) + Scnrm2 = _Tnrm2('Sc', c_float) + Dznrm2 = _Tnrm2('Dz', c_double) + + Sdot = _Tdot('S', c_float) + Ddot = _Tdot('D', c_double) + Cdotu = _Tdot('C', c_complex, 'u') + Zdotu = _Tdot('Z', c_double_complex, 'u') + Cdotc = _Tdot('C', c_complex, 'c') + Zdotc = _Tdot('Z', c_double_complex, 'c') + + Sscal = _Tscal('S', c_float) + Dscal = _Tscal('D', c_double) + Cscal = _Tscal('C', c_complex) + Zscal = _Tscal('Z', c_double_complex) + Csscal = _Tscal('Cs', c_float) + Zdscal = _Tscal('Zd', c_double) + + Saxpy = _Taxpy('S', c_float) + Daxpy = _Taxpy('D', c_double) + Caxpy = _Taxpy('C', c_complex) + Zaxpy = _Taxpy('Z', c_double_complex) + + Isamax = _Itamax('s', c_float) + Idamax = _Itamax('d', c_double) + Icamax = _Itamax('c', c_complex) + Izamax = _Itamax('z', c_double_complex) + + Isamin = _Itamin('s', c_float) + Idamin = _Itamin('d', c_double) + Icamin = _Itamin('c', c_complex) + Izamin = _Itamin('z', c_double_complex) + + Sasum = _Tasum('S', c_float) + Dasum = _Tasum('D', c_double) + Scasum = _Tasum('Sc', c_float) + Dzasum = _Tasum('Dz', c_double) + + Srot = _Trot('S', c_float, c_float) + Drot = _Trot('D', c_double, c_double) + Crot = _Trot('C', c_float, c_complex) + Zrot = _Trot('Z', c_double, c_double_complex) + Csrot = _Trot('Cs', c_float, c_float) + Zdrot = _Trot('Zd', c_double, c_double) + + Srotg = 
_Trotg('S', c_float, c_float) + Drotg = _Trotg('D', c_double, c_double) + Crotg = _Trotg('C', c_complex, c_float) + Zrotg = _Trotg('Z', c_double_complex, c_double) + + Srotm = _Trotm('S', np.float32) + Drotm = _Trotm('D', np.float64) + + Srotmg = _Trotmg('S', c_float, np.float32) + Drotmg = _Trotmg('D', c_double, np.float64) + + Sgbmv = _Tgbmv('S', c_float, np.float32) + Dgbmv = _Tgbmv('D', c_double, np.float64) + Cgbmv = _Tgbmv('C', c_complex, np.complex64) + Zgbmv = _Tgbmv('Z', c_double_complex, np.complex128) + + Sgemv = _Tgemv('S', c_float, np.float32) + Dgemv = _Tgemv('D', c_double, np.float64) + Cgemv = _Tgemv('C', c_complex, np.complex64) + Zgemv = _Tgemv('Z', c_double_complex, np.complex128) + + Strmv = _Ttrmv('S', np.float32) + Dtrmv = _Ttrmv('D', np.float64) + Ctrmv = _Ttrmv('C', np.complex64) + Ztrmv = _Ttrmv('Z', np.complex128) + + Stbmv = _Ttbmv('S', np.float32) + Dtbmv = _Ttbmv('D', np.float64) + Ctbmv = _Ttbmv('C', np.complex64) + Ztbmv = _Ttbmv('Z', np.complex128) + + Stpmv = _Ttpmv('S', np.float32) + Dtpmv = _Ttpmv('D', np.float64) + Ctpmv = _Ttpmv('C', np.complex64) + Ztpmv = _Ttpmv('Z', np.complex128) + + Strsv = _Ttrsv('S', np.float32) + Dtrsv = _Ttrsv('D', np.float64) + Ctrsv = _Ttrsv('C', np.complex64) + Ztrsv = _Ttrsv('Z', np.complex128) + + Stpsv = _Ttpsv('S', np.float32) + Dtpsv = _Ttpsv('D', np.float64) + Ctpsv = _Ttpsv('C', np.complex64) + Ztpsv = _Ttpsv('Z', np.complex128) + + Stbsv = _Ttbsv('S', np.float32) + Dtbsv = _Ttbsv('D', np.float64) + Ctbsv = _Ttbsv('C', np.complex64) + Ztbsv = _Ttbsv('Z', np.complex128) + + Ssymv = _Tsymv('S', c_float, np.float32) + Dsymv = _Tsymv('D', c_double, np.float64) + Csymv = _Tsymv('C', c_complex, np.complex64) + Zsymv = _Tsymv('Z', c_double_complex, np.complex128) + + Chemv = _Themv('C', c_complex, np.complex64) + Zhemv = _Themv('Z', c_double_complex, np.complex128) + + Ssbmv = _Tsbmv('S', c_float, np.float32) + Dsbmv = _Tsbmv('D', c_double, np.float64) + + Chbmv = _Thbmv('C', c_complex, np.complex64) + Zhbmv = _Thbmv('Z', c_double_complex, np.complex128) + + Sspmv = _Tspmv('S', c_float, np.float32) + Dspmv = _Tspmv('D', c_double, np.float64) + + Chpmv = _Thpmv('C', c_complex, np.complex64) + Zhpmv = _Thpmv('Z', c_double_complex, np.complex128) + + Sger = _Tger('S', c_float, np.float32) + Dger = _Tger('D', c_double, np.float64) + Cgeru = _Tgeru('C', c_complex, np.complex64) + Cgerc = _Tgerc('C', c_complex, np.complex64) + Zgeru = _Tgeru('Z', c_double_complex, np.complex128) + Zgerc = _Tgerc('Z', c_double_complex, np.complex128) + + Ssyr = _Tsyr('S', c_float, np.float32) + Dsyr = _Tsyr('D', c_double, np.float64) + Csyr = _Tsyr('C', c_complex, np.complex64) + Zsyr = _Tsyr('Z', c_double_complex, np.complex128) + + Cher = _Ther('C', c_float, np.complex64) + Zher = _Ther('Z', c_double, np.complex128) + + Sspr = _Tspr('S', c_float, np.float32) + Dspr = _Tspr('D', c_double, np.float64) + Chpr = _Thpr('C', c_float, np.complex64) + Zhpr = _Thpr('Z', c_double, np.complex128) + + Ssyr2 = _Tsyr2('S', c_float, np.float32) + Dsyr2 = _Tsyr2('D', c_double, np.float64) + Csyr2 = _Tsyr2('C', c_complex, np.complex64) + Zsyr2 = _Tsyr2('Z', c_double_complex, np.complex128) + Cher2 = _Ther2('C', c_complex, np.complex64) + Zher2 = _Ther2('Z', c_double_complex, np.complex128) + + Sspr2 = _Tspr2('S', c_float, np.float32) + Dspr2 = _Tspr2('D', c_double, np.float64) + + Chpr2 = _Thpr2('C', c_complex, np.complex64) + Zhpr2 = _Thpr2('Z', c_double_complex, np.complex128) + + Sgemm = _Tgemm('S', c_float, np.float32) + Dgemm = _Tgemm('D', c_double, 
np.float64) + Cgemm = _Tgemm('C', c_complex, np.complex64) + Zgemm = _Tgemm('Z', c_double_complex, np.complex128) + + Ssyrk = _Tsyrk('S', c_float, np.float32) + Dsyrk = _Tsyrk('D', c_double, np.float64) + Csyrk = _Tsyrk('C', c_complex, np.complex64) + Zsyrk = _Tsyrk('Z', c_double_complex, np.complex128) + + Cherk = _Therk('C', c_float, np.complex64) + Zherk = _Therk('Z', c_double, np.complex128) + + Ssymm = _Tsymm('S', c_float, np.float32) + Dsymm = _Tsymm('D', c_double, np.float64) + Csymm = _Tsymm('C', c_complex, np.complex64) + Zsymm = _Tsymm('Z', c_double_complex, np.complex128) + + Chemm = _Themm('C', c_complex, np.complex64) + Zhemm = _Themm('Z', c_double_complex, np.complex128) + + Strsm = _Ttrsm('S', c_float, np.float32) + Dtrsm = _Ttrsm('D', c_double, np.float64) + Ctrsm = _Ttrsm('C', c_complex, np.complex64) + Ztrsm = _Ttrsm('Z', c_double_complex, np.complex128) + + Strmm = _Ttrmm('S', c_float, np.float32) + Dtrmm = _Ttrmm('D', c_double, np.float64) + Ctrmm = _Ttrmm('C', c_complex, np.complex64) + Ztrmm = _Ttrmm('Z', c_double_complex, np.complex128) + + Sdgmm = _Tdgmm('S', c_float, np.float32) + Ddgmm = _Tdgmm('D', c_double, np.float64) + Cdgmm = _Tdgmm('C', c_complex, np.complex64) + Zdgmm = _Tdgmm('Z', c_double_complex, np.complex128) + + Sgeam = _Tgeam('S', c_float, np.float32) + Dgeam = _Tgeam('D', c_double, np.float64) + Cgeam = _Tgeam('C', c_complex, np.complex64) + Zgeam = _Tgeam('Z', c_double_complex, np.complex128) diff --git a/pyculib/config.py b/pyculib/config.py new file mode 100644 index 0000000..58813fd --- /dev/null +++ b/pyculib/config.py @@ -0,0 +1,26 @@ +import os + +class Environment: + + def __init__(self): + """Set config flags based on environment variables.""" + + self._environ = os.environ + WARNINGS = Environment._readenv("PYCULIB_WARNINGS", int, 0) + + globals()['WARNINGS'] = WARNINGS + + @staticmethod + def _readenv(name, ctor, default): + value = os.environ.get(name) + if value is None: + return default() if callable(default) else default + try: + return ctor(value) + except Exception: + warnings.warn("environ %s defined but failed to parse '%s'" % + (name, res), RuntimeWarning) + return default + +_env = Environment() + diff --git a/pyculib/fft/__init__.py b/pyculib/fft/__init__.py new file mode 100644 index 0000000..edfa8a1 --- /dev/null +++ b/pyculib/fft/__init__.py @@ -0,0 +1 @@ +from .api import FFTPlan, fft, ifft, fft_inplace, ifft_inplace diff --git a/pyculib/fft/api.py b/pyculib/fft/api.py new file mode 100644 index 0000000..2d96788 --- /dev/null +++ b/pyculib/fft/api.py @@ -0,0 +1,233 @@ +import numpy as np +from . import binding as _cufft +from numba import cuda +from numba.cuda.kernels.transpose import transpose +from numba.cuda.cudadrv.devicearray import DeviceNDArray + +def _prepare_types(pairs): + return dict((tuple(map(np.dtype, k)), + getattr(_cufft, 'CUFFT_' + v)) + for k, v in pairs.items()) + +class FFTPlan(object): + ''' + :param shape: Input array shape. + :param itype: Input data type. + :param otype: Output data type. + :param batch: Maximum number of operation to perform. + :param stream: A CUDA stream for all the operations to put on. + :param mode: Operation mode; e.g. MODE_NATIVE, MODE_FFTW_PADDING, + MODE_FFTW_ASYMMETRIC, MODE_FFTW_ALL, MODE_DEFAULT. 
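+
+    A minimal usage sketch (shapes and dtypes are illustrative only; a
+    CUDA-capable device is assumed). For a real-to-complex plan the output
+    has shape[-1] // 2 + 1 points along the last axis:
+
+    >>> import numpy as np
+    >>> from pyculib.fft import FFTPlan
+    >>> x = np.arange(8, dtype=np.float32)
+    >>> y = np.empty(5, dtype=np.complex64)   # 8 // 2 + 1 output points
+    >>> plan = FFTPlan(x.shape, np.float32, np.complex64)
+    >>> plan.forward(x, out=y)                # fills and returns `y`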
+ ''' + + MODE_NATIVE = _cufft.CUFFT_COMPATIBILITY_NATIVE + MODE_FFTW_PADDING = _cufft.CUFFT_COMPATIBILITY_FFTW_PADDING + MODE_FFTW_ASYMMETRIC = _cufft.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC + MODE_FFTW_ALL = _cufft.CUFFT_COMPATIBILITY_FFTW_ALL + MODE_DEFAULT = _cufft.CUFFT_COMPATIBILITY_DEFAULT + + SUPPORTED_TYPES = _prepare_types({ + (np.float32, np.complex64) : 'R2C', + (np.float64, np.complex128) : 'D2Z', + (np.complex64, np.float32) : 'C2R', + (np.complex128, np.float64) : 'Z2D', + (np.complex64, np.complex64) : 'C2C', + (np.complex128, np.complex128) : 'Z2Z', + }) + + @cuda.require_context + def __init__(self, shape, itype, otype, batch=1, stream=0, + mode=MODE_DEFAULT): + + itype = np.dtype(itype) + otype = np.dtype(otype) + + try: + operation = self.SUPPORTED_TYPES[(itype, otype)] + except KeyError: + raise ValueError("Invalid type combination") + + if operation in (_cufft.CUFFT_R2C, _cufft.CUFFT_D2Z): + direction = 'forward' + elif operation in (_cufft.CUFFT_C2R, _cufft.CUFFT_Z2D): + direction = 'inverse' + shape = shape[:-1] + ((shape[-1] - 1) * 2,) + else: + direction = 'both' + + self._plan = _cufft.Plan.many(shape, operation, batch=batch) + if stream: + self._plan.set_stream(stream) + self._plan.set_compatibility_mode(mode) + + complex_types = [np.dtype(x) for x in (np.complex64, np.complex128)] + + if itype in complex_types and otype in complex_types: + ishape = oshape = shape + elif itype in complex_types: + ishape = oshape = shape[:-1] + (shape[-1]//2 + 1,) + elif otype in complex_types: + ishape = shape + oshape = shape[:-1] + (shape[-1]//2 + 1,) + else: + raise ValueError("Invalid type combination") + + self.operation = operation + self.itype = itype + self.otype = otype + self.shape = shape + self.ishape = ishape + self.oshape = oshape + self.batch = batch + self.stream = stream + self.mode = mode + self.direction = direction + + def _prepare(self, ary, out): + if ary.shape < self.ishape: + raise ValueError("Incompatible input array shape") + + if ary.dtype != self.itype: + raise ValueError("Incompatiable input array dtype") + + do_host_copy = False + if out is not None: + h_out = out + d_out, do_host_copy = cuda._auto_device(out, copy=False, + stream=self.stream) + else: + h_out = np.empty(shape=self.oshape, dtype=self.otype) + d_out = from_array_like(h_out, self.stream) + do_host_copy = True + + if h_out.shape < self.oshape: + raise ValueError("Incompatible output shape") + + d_ary, _ = cuda._auto_device(ary, stream=self.stream) + return d_ary, d_out, h_out, do_host_copy + + def _maybe_transpose(self, d_ary, d_out): + """Transpose device arrays into row-major format if needed, as cuFFT + can't handle column-major data.""" + + transpose_in = len(d_ary.shape) == 2 and d_ary.is_f_contiguous() + transpose_out = len(d_out.shape) == 2 and d_out.is_f_contiguous() + if transpose_in: + # Create a row-major device array + used_in = DeviceNDArray(shape=(d_ary.shape[1], d_ary.shape[0]), + strides=(d_ary.dtype.itemsize, + d_ary.dtype.itemsize*d_ary.shape[1]), + dtype=d_ary.dtype) + transpose(d_ary, used_in) + else: + used_in = d_ary + if transpose_out: + # Create a row-major device array + used_out = DeviceNDArray(shape=d_out.shape, + strides=(d_out.dtype.itemsize*d_out.shape[1], + d_out.dtype.itemsize), + dtype=d_out.dtype) + else: + used_out = d_out + return used_in, used_out, transpose_out + + def forward(self, ary, out=None): + '''Perform forward FFT + + :param ary: Input array + :param out: Optional output array + + :returns: The output array or a new numpy array is `out` is 
None. + + .. note:: If `ary` is `out`, an inplace operation is performed. + ''' + if self.direction not in ('both', 'forward'): + raise TypeError("Invalid operation") + d_ary, d_out, h_out, do_host_copy = self._prepare(ary, out) + used_in, used_out, transpose_out = self._maybe_transpose(d_ary, d_out) + self._plan.forward(used_in, used_out) + if do_host_copy: + if transpose_out: + # reshape the array so it can be transposed back + used_out.shape=(used_out.shape[1], used_out.shape[0]) + used_out.strides=(used_out.dtype.itemsize, + used_out.dtype.itemsize*used_out.shape[0]) + # and finally transpose it + transpose(used_out, d_out) + d_out.copy_to_host(h_out) + return h_out + + def inverse(self, ary, out=None): + '''Perform inverse FFT + + :param ary: Input array + :param out: Optional output array + + :returns: The output array or a new numpy array is `out` is None. + + .. note: If `ary` is `out`, an inplace operation is performed. + ''' + if self.direction not in ('both', 'inverse'): + raise TypeError("Invalid operation") + d_ary, d_out, h_out, do_host_copy = self._prepare(ary, out) + used_in, used_out, transpose_out = self._maybe_transpose(d_ary, d_out) + self._plan.inverse(used_in, used_out) + if do_host_copy: + if transpose_out: + # reshape the array so it can be transposed back + used_out.shape=(used_out.shape[1], used_out.shape[0]) + used_out.strides=(used_out.dtype.itemsize, + used_out.dtype.itemsize*used_out.shape[0]) + # and finally transpose it + transpose(used_out, d_out) + d_out.copy_to_host(h_out) + return h_out + +# +# Simple one-off functions +# + +def fft(ary, out, stream=None): + '''Perform forward FFT on `ary` and output to `out`. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + plan = FFTPlan(ary.shape, ary.dtype, out.dtype, stream=stream) + plan.forward(ary, out) + return out + +def ifft(ary, out, stream=None): + '''Perform inverse FFT on `ary` and output to `out`. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + plan = FFTPlan(ary.shape, ary.dtype, out.dtype, stream=stream) + plan.inverse(ary, out) + return out + +def fft_inplace(ary, stream=None): + '''Perform inplace forward FFT. `ary` must have complex dtype. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + d_ary, conv = cuda._auto_device(ary, stream=stream) + fft(d_ary, d_ary, stream=stream) + if conv: + d_ary.copy_to_host(ary) + return ary + + +def ifft_inplace(ary, stream=None): + '''Perform inplace inverse FFT. `ary` must have complex dtype. 
+ + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + d_ary, conv = cuda._auto_device(ary, stream=stream) + ifft(d_ary, d_ary, stream=stream) + if conv: + d_ary.copy_to_host(ary) + return ary diff --git a/pyculib/fft/binding.py b/pyculib/fft/binding.py new file mode 100644 index 0000000..55b273d --- /dev/null +++ b/pyculib/fft/binding.py @@ -0,0 +1,261 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import c_void_p, c_int, POINTER, byref + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import Lib, ctype_function, finalizer + +STATUS = { + 0x0: 'CUFFT_SUCCESS', + 0x1: 'CUFFT_INVALID_PLAN', + 0x2: 'CUFFT_ALLOC_FAILED', + 0x3: 'CUFFT_INVALID_TYPE', + 0x4: 'CUFFT_INVALID_VALUE', + 0x5: 'CUFFT_INTERNAL_ERROR', + 0x6: 'CUFFT_EXEC_FAILED', + 0x7: 'CUFFT_SETUP_FAILED', + 0x8: 'CUFFT_INVALID_SIZE', + 0x9: 'CUFFT_UNALIGNED_DATA', +} + +cufftResult = c_int + +CUFFT_FORWARD = -1 +CUFFT_INVERSE = 1 + +CUFFT_R2C = 0x2a # Real to Complex (interleaved) +CUFFT_C2R = 0x2c # Complex (interleaved) to Real +CUFFT_C2C = 0x29 # Complex to Complex, interleaved +CUFFT_D2Z = 0x6a # Double to Double-Complex +CUFFT_Z2D = 0x6c # Double-Complex to Double +CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex + +cufftType = c_int + +CUFFT_COMPATIBILITY_NATIVE = 0x00 +CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 # The default value +CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02 +CUFFT_COMPATIBILITY_FFTW_ALL = 0x03 + +CUFFT_COMPATIBILITY_DEFAULT = CUFFT_COMPATIBILITY_FFTW_PADDING + +cufftCompatibility = c_int + +cufftHandle = c_int + + +class CuFFTError(Exception): + def __init__(self, code): + super(CuFFTError, self).__init__(STATUS[code]) + + +class libcufft(Lib): + lib = 'cufft' + ErrorType = CuFFTError + + @property + def version(self): + ver = c_int(0) + self.cufftGetVersion(byref(ver)) + return ver.value + + cufftGetVersion = ctype_function(cufftResult, POINTER(c_int)) + + cufftPlan1d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + cufftType, # type + c_int, + # batch - deprecated - use cufftPlanMany + ) + + cufftPlan2d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + c_int, # ny + cufftType # type + ) + + cufftPlan3d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + c_int, # ny + c_int, # nz + cufftType # type + ) + + cufftPlanMany = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # rank + c_void_p, # POINTER(c_int) n + c_void_p, # POINTER(c_int) inembed + c_int, # istride + c_int, # idist + c_void_p, # POINTER(c_int) onembed + c_int, # ostride + c_int, # odist + cufftType, # type + c_int, # batch + ) + + cufftDestroy = ctype_function(cufftResult, + cufftHandle, # plan + ) + + cufftExecC2C = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftComplex) idata + c_void_p, # POINTER(cufftComplex) odata + c_int, # direction + ) + + cufftExecR2C = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftReal) idata + c_void_p, # POINTER(cufftComplex) odata + c_int, + ) + + cufftExecC2R = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftComplex) idata + c_void_p, # POINTER(cufftReal) odata + c_int, + ) + + cufftExecZ2Z = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleComplex) idata + c_void_p, # POINTER(cufftDoubleComplex) odata + c_int, # direction + ) + + 
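+    # The remaining cufftExec* bindings below are declared here with the same
+    # (plan, idata, odata, direction) argument layout; only the element types
+    # of idata/odata differ.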
cufftExecD2Z = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleReal) idata + c_void_p, # POINTER(cufftDoubleComplex) odata + c_int, + ) + + cufftExecZ2D = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleComplex) idata + c_void_p, # POINTER(cufftDoubleReal) odata + c_int, + ) + + cufftSetStream = ctype_function(cufftResult, + cufftHandle, # plan, + cu_stream, # stream + ) + + cufftSetCompatibilityMode = ctype_function(cufftResult, + cufftHandle, # plan, + cufftCompatibility # mode + ) + + +cufft_dtype_to_name = { + CUFFT_R2C: 'R2C', + CUFFT_C2R: 'C2R', + CUFFT_C2C: 'C2C', + CUFFT_D2Z: 'D2Z', + CUFFT_Z2D: 'Z2D', + CUFFT_Z2Z: 'Z2Z', +} + + +class Plan(finalizer.OwnerMixin): + @classmethod + def one(cls, dtype, nx): + "cufftPlan1d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + BATCH = 1 # deprecated args to cufftPlan1d + inst._api.cufftPlan1d(byref(inst._handle), int(nx), int(dtype), + BATCH) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def two(cls, dtype, nx, ny): + "cufftPlan2d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + inst._api.cufftPlan2d(byref(inst._handle), int(nx), int(ny), + int(dtype)) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def three(cls, dtype, nx, ny, nz): + "cufftPlan3d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + inst._api.cufftPlan3d(byref(inst._handle), int(nx), int(ny), + int(nz), int(dtype)) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def many(cls, shape, dtype, batch=1): + "cufftPlanMany" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + + c_shape = np.asarray(shape, dtype=np.int32) + inst._api.cufftPlanMany(byref(inst._handle), + len(shape), + c_shape.ctypes.data, + None, 1, 0, + None, 1, 0, + int(dtype), int(batch)) + inst.shape = shape + inst.dtype = dtype + inst.batch = batch + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cufftDestroy(handle) + + def set_stream(self, stream): + "Associate a CUDA stream to this plan object" + return self._api.cufftSetStream(self._handle, stream.handle) + + def set_compatibility_mode(self, mode): + return self._api.cufftSetCompatibilityMode(self._handle, mode) + + def set_native_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_NATIVE) + + def set_fftw_padding_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_PADDING) + + def set_fftw_asymmetric_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC) + + def set_fftw_all_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_ALL) + + def exe(self, idata, odata, dir): + postfix = cufft_dtype_to_name[self.dtype] + meth = getattr(self._api, 'cufftExec' + postfix) + return meth(self._handle, device_pointer(idata), + device_pointer(odata), int(dir)) + + def forward(self, idata, odata): + return self.exe(idata, odata, dir=CUFFT_FORWARD) + + def inverse(self, idata, odata): + return self.exe(idata, odata, dir=CUFFT_INVERSE) diff --git a/pyculib/nputil.py b/pyculib/nputil.py new file mode 100644 index 0000000..982e967 --- /dev/null +++ b/pyculib/nputil.py @@ -0,0 +1,45 @@ +import numpy as np +from pyculib 
import warnings + +promote = np.promote_types # type promotion + +def alias(a, b): + """Check whether the arrays `a` and `b` alias.""" + + if a is b: + return True + elif a.base is None and b.base is None: + return False + else: + return a.base is b or a is b.base or a.base is b.base + +def astype(x, dtype, var, stacklevel=3): + """Return `x` or a copy of `x`, with its type converted to `dtype`. + `var` is the name of `x` as seen by users of a public API, which may be + used in a warning message. `stacklevel` corresponds to the number of frames + to skip when reporting the warning.""" + + # stacklevel=3 means the warning will be reported against the BLAS call, + # not against this (astype()) function. + # Make this a variable as sometimes the call is nested, so the number of + # frames needs to be adjusted. + if dtype != x.dtype: + warnings.warn("%s (%s) is converted to %s"%(var, x.dtype, dtype), + warnings.PerformanceWarning, stacklevel=stacklevel) + return x.astype(dtype, copy=False) + + +def colmajor(x, dtype, var): + """Return `x` or a copy of `x`, with its dimension ordering converted to + column-major, and its type converted to `dtype`. + `var` is the name of `x` as seen by users of a public API, which may be + used in a warning message.""" + + if not x.flags['F_CONTIGUOUS']: + warnings.warn("%s is converted to column-major layout"%(var), + warnings.PerformanceWarning, stacklevel=3) + return np.asfortranarray(x, dtype=dtype) + else: + return astype(x, dtype, var, stacklevel=4) + + diff --git a/pyculib/rand/__init__.py b/pyculib/rand/__init__.py new file mode 100644 index 0000000..1e9c70b --- /dev/null +++ b/pyculib/rand/__init__.py @@ -0,0 +1,2 @@ +from .api import PRNG, QRNG +from .api import uniform, normal, lognormal, poisson, quasi diff --git a/pyculib/rand/api.py b/pyculib/rand/api.py new file mode 100644 index 0000000..573b53b --- /dev/null +++ b/pyculib/rand/api.py @@ -0,0 +1,371 @@ +import numpy as np +import time +from . import binding +from numba import cuda + +class RNG(object): + "cuRAND pseudo random number generator" + def __init__(self, gen): + self._gen = gen + self.__stream = 0 + + @property + def offset(self): + return self.__offset + + @offset.setter + def offset(self, offset): + self.__offset = offset + self._gen.set_offset(offset) + + @property + def stream(self): + '''Associate a CUDA stream to the generator object. + All subsequent calls will use this stream.''' + return self.__stream + + @stream.setter + def stream(self, stream): + self.__stream = stream + self._gen.set_stream(stream) + + def _require_array(self, ary): + if ary.ndim != 1: + raise TypeError("Only accept 1-D array") + if ary.strides[0] != ary.dtype.itemsize: + raise TypeError("Only accept unit strided array") + + +class PRNG(RNG): + '''cuRAND pseudo random number generator + + :param rndtype: Algorithm type. All possible values are listed as + class attributes of this class, e.g. TEST, DEFAULT, + XORWOW, MRG32K3A, MTGP32. + :param seed: Seed for the RNG. + :param offset: Offset to the random number stream. + :param stream: CUDA stream. + + Example: + + >>> from pyculib import rand + >>> from numpy import empty + >>> prng = rand.PRNG(rndtype=rand.PRNG.XORWOW) + >>> r = empty(10) + >>> prng.uniform(r) + >>> r + array([ ... 
]) + ''' + + TEST = binding.CURAND_RNG_TEST + DEFAULT = binding.CURAND_RNG_PSEUDO_DEFAULT + XORWOW = binding.CURAND_RNG_PSEUDO_XORWOW + MRG32K3A = binding.CURAND_RNG_PSEUDO_MRG32K3A + MTGP32 = binding.CURAND_RNG_PSEUDO_MTGP32 + + @cuda.require_context + def __init__(self, rndtype=DEFAULT, seed=None, offset=None, stream=None): + super(PRNG, self).__init__(binding.Generator(rndtype)) + self.rndtype = rndtype + if seed is not None: + self.seed = seed + if offset is not None: + self.offset = offset + if stream is not None: + self.stream = stream + + @property + def seed(self): + "Mutatable attribute for the seed for the RNG" + return self.__seed + + @seed.setter + def seed(self, seed): + self.__seed = seed + self._gen.set_pseudo_random_generator_seed(seed) + + def uniform(self, ary, size=None): + '''Generate floating point random number sampled + from a uniform distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_uniform(dary, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + def normal(self, ary, mean, sigma, size=None): + '''Generate floating point random number sampled + from a normal distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param mean: Center of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_normal(dary, size, mean, sigma) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + + def lognormal(self, ary, mean, sigma, size=None): + '''Generate floating point random number sampled + from a log-normal distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param mean: Center of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_log_normal(dary, size, mean, sigma) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + def poisson(self, ary, lmbd, size=None): + '''Generate floating point random number sampled + from a poisson distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param lmbda: Lambda for the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_poisson(dary, lmbd, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + +class QRNG(RNG): + '''cuRAND quasi random number generator + + :param rndtype: Algorithm type. + Also control output data type. + All possible values are listed as class + attributes of this class, e.g. TEST, DEFAULT, SOBOL32, + SCRAMBLED_SOBOL32, SOBOL64, SCRAMABLED_SOBOL64. + :param ndim: Number of dimension for the QRNG. + :param offset: Offset to the random number stream. + :param stream: CUDA stream. 
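+
+    Example (a minimal sketch mirroring the PRNG example above; note that
+    SOBOL32 output must be written into a 32-bit integer array):
+
+    >>> from pyculib import rand
+    >>> from numpy import empty, uint32
+    >>> qrng = rand.QRNG(rndtype=rand.QRNG.SOBOL32, ndim=1)
+    >>> r = empty(10, dtype=uint32)
+    >>> qrng.generate(r)
+    >>> r
+    array([ ... ])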
+ ''' + + + TEST = binding.CURAND_RNG_TEST + DEFAULT = binding.CURAND_RNG_QUASI_DEFAULT + SOBOL32 = binding.CURAND_RNG_QUASI_SOBOL32 + SCRAMBLED_SOBOL32 = binding.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 + SOBOL64 = binding.CURAND_RNG_QUASI_SOBOL64 + SCRAMBLED_SOBOL64 = binding.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 + + @cuda.require_context + def __init__(self, rndtype=DEFAULT, ndim=None, offset=None, stream=None): + super(QRNG, self).__init__(binding.Generator(rndtype)) + self.rndtype = rndtype + if ndim is not None: + self.ndim = ndim + if offset is not None: + self.offset = offset + if stream is not None: + self.stream = stream + + @property + def ndim(self, ndim): + '''Mutatable attribute for number of dimension for the QRNG. + ''' + return self.__ndim + + @ndim.setter + def ndim(self, ndim): + self.__ndim = ndim + self._gen.set_quasi_random_generator_dimensions(ndim) + + def generate(self, ary, size=None): + """Generate quasi random number in ary. + + :param ary: Numpy array or cuda device array. + + :param size: Number of samples; + Default to array size. Must be multiple of ndim. + """ + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate(dary, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + +# +# Top level function entry points. +# + +_global_rng = {} + +def _get_prng(): + key = 'prng' + prng = _global_rng.get(key) + if not prng: + prng = PRNG() + prng.seed = int(time.time()) + _global_rng[key] = prng + return prng + + +def _get_qrng(bits): + assert bits in (32, 64), "not 32 or 64 bit" + key = 'qrng%d' % bits + qrng = _global_rng.get(key) + if not qrng: + qrng = QRNG(rndtype=getattr(QRNG, 'SOBOL%d' % bits)) + _global_rng[key] = qrng + return qrng + +def uniform(size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a uniform distribution + + :param size: Number of samples. + :param dtype: np.float32 or np.float64. + :param device: Set to True to return a device array instead or numpy array. + + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.uniform(size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.uniform` + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.uniform(devary, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def normal(mean, sigma, size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a normal distribution + + :param mean: Center point of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: --- Number of samples. + :param dtype: np.float32 or np.float64. + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.normal(mean=0, sigma=1, size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.normal` + + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.normal(devary, mean, sigma, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def lognormal(mean, sigma, size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a log-normal distribution. + + :param mean: Center point of the distribution. + :param sigma: Standard deviation of the distribution. 
+ :param size: Number of samples. + :param dtype: np.float32 or np.float64. + :param device: set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.lognormal(mean=0, sigma=1, size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.lognormal` + + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.lognormal(devary, mean, sigma, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def poisson(lmbd, size, device=False): + '''Generate int32 random number sampled + from a poisson distribution. + + :param lmbda: Lambda of the distribution. + :param size: Number of samples + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.poisson(lmbd=1, size=10) + array([...], dtype=uint32) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.poisson` + ''' + ary = np.empty(size, dtype=np.uint32) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.poisson(devary, lmbd, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def quasi(size, bits=32, nd=1, device=False): + '''Generate quasi random number using SOBOL{bits} RNG type. + + :param size: Number of samples. + :param bits: Bit length of output element; e.g. 32 or 64. + :param nd: Number of dimension . + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.quasi(10) + array([...], dtype=uint32) + + .. seealso:: :py:meth:`pyculib.rand.QRNG.generate` + ''' + if bits == 64: + dtype = np.uint64 + elif bits == 32: + dtype = np.uint32 + else: + raise ValueError("Only accept bits = 32 or 64") + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + qrng = _get_qrng(bits) + qrng.ndim = nd + qrng.generate(devary, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary diff --git a/pyculib/rand/binding.py b/pyculib/rand/binding.py new file mode 100644 index 0000000..8914017 --- /dev/null +++ b/pyculib/rand/binding.py @@ -0,0 +1,298 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import (c_float, c_int, c_void_p, POINTER, byref, cast, c_ulonglong, + c_uint, c_double, c_size_t) + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import Lib, ctype_function, finalizer + +# enum curandStatus +STATUS = { + 0: ('CURAND_STATUS_SUCCESS', + 'No errors'), + 100: ('CURAND_STATUS_VERSION_MISMATCH', + 'Header file and linked library version do not match'), + 101: ('CURAND_STATUS_NOT_INITIALIZED', + 'Generator not initialized'), + 102: ('CURAND_STATUS_ALLOCATION_FAILED', + 'Memory allocation failed'), + 103: ('CURAND_STATUS_TYPE_ERROR', + 'Generator is wrong type'), + 104: ('CURAND_STATUS_OUT_OF_RANGE', + 'Argument out of range'), + 105: ('CURAND_STATUS_LENGTH_NOT_MULTIPLE', + 'Length requested is not a multple of dimension'), + 106: ('CURAND_STATUS_DOUBLE_PRECISION_REQUIRED', + 'GPU does not have double precision required by MRG32k3a'), + 201: ('CURAND_STATUS_LAUNCH_FAILURE', + 'Kernel launch failure'), + 202: ('CURAND_STATUS_PREEXISTING_FAILURE', + 'Preexisting failure on library entry'), + 203: ('CURAND_STATUS_INITIALIZATION_FAILED', + 'Initialization of CUDA failed'), + 204: 
('CURAND_STATUS_ARCH_MISMATCH', + 'Architecture mismatch, GPU does not support requested feature'), + 999: ('CURAND_STATUS_INTERNAL_ERROR', + 'Internal library error'), +} +curandStatus_t = c_int + + +# enum curandRngType +CURAND_RNG_TEST = 0 +## Default pseudorandom generator +CURAND_RNG_PSEUDO_DEFAULT = 100 +## XORWOW pseudorandom generator +CURAND_RNG_PSEUDO_XORWOW = 101 +## MRG32k3a pseudorandom generator +CURAND_RNG_PSEUDO_MRG32K3A = 121 +## Mersenne Twister pseudorandom generator +CURAND_RNG_PSEUDO_MTGP32 = 141 +## Default quasirandom generator +CURAND_RNG_QUASI_DEFAULT = 200 +## Sobol32 quasirandom generator +CURAND_RNG_QUASI_SOBOL32 = 201 +## Scrambled Sobol32 quasirandom generator +CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 +## Sobol64 quasirandom generator +CURAND_RNG_QUASI_SOBOL64 = 203 +## Scrambled Sobol64 quasirandom generator +CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 +curandRngType_t = c_int + +# enum curandOrdering +## Best ordering for pseudorandom results +CURAND_ORDERING_PSEUDO_BEST = 100 +## Specific default 4096 thread sequence for pseudorandom results +CURAND_ORDERING_PSEUDO_DEFAULT = 101 +## Specific seeding pattern for fast lower quality pseudorandom results +CURAND_ORDERING_PSEUDO_SEEDED = 102 +## Specific n-dimensional ordering for quasirandom results +CURAND_ORDERING_QUASI_DEFAULT = 201 +curandOrdering_t = c_int + +# enum curandDirectionVectorSet +## Specific set of 32-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions +CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 +## Specific set of 32-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, +## and scrambled +CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 +## Specific set of 64-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions +CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 +## Specific set of 64-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. 
Kuo, for up to 20,000 dimensions, +## and scrambled +CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 +curandDirectionVectorSet_t = c_int + +# enum curandMethod +CURAND_CHOOSE_BEST = 0 +CURAND_ITR = 1 +CURAND_KNUTH = 2 +CURAND_HITR = 3 +CURAND_M1 = 4 +CURAND_M2 = 5 +CURAND_BINARY_SEARCH = 6 +CURAND_DISCRETE_GAUSS = 7 +CURAND_REJECTION = 8 +CURAND_DEVICE_API = 9 +CURAND_FAST_REJECTION = 10 +CURAND_3RD = 11 +CURAND_DEFINITION = 12 +CURAND_POISSON = 13 +curandMethod_t = c_int + +curandGenerator_t = c_void_p +p_curandGenerator_t = POINTER(curandGenerator_t) + + +class CuRandError(Exception): + def __init__(self, code): + super(CuRandError, self).__init__(STATUS[code]) + + +class libcurand(Lib): + lib = 'curand' + ErrorType = CuRandError + + @property + def version(self): + ver = c_int(0) + self.curandGetVersion(byref(ver)) + return ver.value + + curandGetVersion = ctype_function(curandStatus_t, POINTER(c_int)) + + curandCreateGenerator = ctype_function( + curandStatus_t, + p_curandGenerator_t, # generator reference + curandRngType_t) # rng_type + + curandDestroyGenerator = ctype_function( + curandStatus_t, + curandGenerator_t) + + curandSetStream = ctype_function(curandStatus_t, + curandGenerator_t, + cu_stream) + + curandSetGeneratorOffset = ctype_function(curandStatus_t, + curandGenerator_t, + c_ulonglong) + + curandSetPseudoRandomGeneratorSeed = ctype_function( + curandStatus_t, + curandGenerator_t, + c_ulonglong) + + curandSetQuasiRandomGeneratorDimensions = ctype_function( + curandStatus_t, + curandGenerator_t, + c_uint) + + curandGenerate = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_uint), + c_size_t) + + curandGenerateLongLong = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_ulonglong), + c_size_t) + + curandGenerateUniform = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t) + + curandGenerateUniformDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t) + + curandGenerateNormal = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t, + c_float, + c_float) + + curandGenerateNormalDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t, + c_double, + c_double) + + curandGenerateLogNormal = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t, + c_float, + c_float) + + curandGenerateLogNormalDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t, + c_double, + c_double) + + curandGeneratePoisson = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_uint), + c_size_t, + c_double) + + +class Generator(finalizer.OwnerMixin): + def __init__(self, rng_type=CURAND_RNG_TEST): + self._api = libcurand() + self._handle = curandGenerator_t(0) + self._api.curandCreateGenerator(byref(self._handle), rng_type) + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.curandDestroyGenerator(handle) + + def set_stream(self, stream): + return self._api.curandSetStream(self._handle, stream.handle) + + def set_offset(self, offset): + return self._api.curandSetGeneratorOffset(self._handle, offset) + + def set_pseudo_random_generator_seed(self, seed): + return self._api.curandSetPseudoRandomGeneratorSeed(self._handle, seed) + + def set_quasi_random_generator_dimensions(self, num_dim): + return self._api.curandSetQuasiRandomGeneratorDimensions(self._handle, + num_dim) + + def generate(self, 
devout, num): + fn, ptr = self.__uint32_or_uint64(devout, + self._api.curandGenerate, + self._api.curandGenerateLongLong) + return fn(self._handle, ptr, num) + + def generate_uniform(self, devout, num): + ''' + devout --- device array for the output + num --- # of float to generate + ''' + fn, ptr = self.__float_or_double(devout, + self._api.curandGenerateUniform, + self._api.curandGenerateUniformDouble) + return fn(self._handle, ptr, num) + + def generate_normal(self, devout, num, mean, stddev): + fn, ptr = self.__float_or_double(devout, + self._api.curandGenerateNormal, + self._api.curandGenerateNormalDouble) + return fn(self._handle, ptr, num, mean, stddev) + + def generate_log_normal(self, devout, num, mean, stddev): + fn, ptr = self.__float_or_double( + devout, + self._api.curandGenerateLogNormal, + self._api.curandGenerateLogNormalDouble) + return fn(self._handle, ptr, num, mean, stddev) + + def generate_poisson(self, devout, num, lmbd): + if devout.dtype not in (np.dtype(np.uint32), np.dtype(np.int32)): + raise ValueError("Only accept int32 or uint32 arrays") + dptr = device_pointer(devout) + ptr = cast(c_void_p(dptr), POINTER(c_uint)) + return self._api.curandGeneratePoisson(self._handle, ptr, num, lmbd) + + def __float_or_double(self, devary, floatfn, doublefn): + if devary.dtype == np.float32: + fn = floatfn + fty = c_float + elif devary.dtype == np.float64: + fn = doublefn + fty = c_double + else: + raise ValueError("Only accept float or double arrays.") + dptr = device_pointer(devary) + ptr = cast(c_void_p(dptr), POINTER(fty)) + return fn, ptr + + def __uint32_or_uint64(self, devary, fn32, fn64): + if devary.dtype in (np.dtype(np.uint32), np.dtype(np.int32)): + fn = self._api.curandGenerate + ity = c_uint + elif devary.dtype in (np.dtype(np.uint64), np.dtype(np.int64)): + fn = self._api.curandGenerateLongLong + ity = c_ulonglong + else: + raise ValueError("Only accept int32, int64, " + "uint32 or uint64 arrays") + dptr = device_pointer(devary) + ptr = cast(c_void_p(dptr), POINTER(ity)) + return fn, ptr diff --git a/pyculib/runtests.py b/pyculib/runtests.py new file mode 100644 index 0000000..d583710 --- /dev/null +++ b/pyculib/runtests.py @@ -0,0 +1,20 @@ +import sys +from os.path import dirname +from numba.testing import run_tests + +def _main(argv, **kwds): + # This helper function assumes the first element of argv + # is the name of the calling program. + # The 'main' API function is invoked in-process, and thus + # will synthesize that name. + return run_tests(argv, defaultTest='pyculib', + topleveldir=dirname(dirname(__file__)), **kwds).wasSuccessful() + +def main(*argv, **kwds): + """keyword arguments are accepted for backward compatiblity only. + See `numba.testing.run_tests()` documentation for details.""" + + return _main(['
'] + list(argv), **kwds) + +if __name__ == '__main__': + sys.exit(0 if _main(sys.argv) else 1) diff --git a/pyculib/sorting/__init__.py b/pyculib/sorting/__init__.py new file mode 100644 index 0000000..c735d15 --- /dev/null +++ b/pyculib/sorting/__init__.py @@ -0,0 +1,2 @@ +from .radixsort import RadixSort +from .segsort import segmented_sort diff --git a/pyculib/sorting/common.py b/pyculib/sorting/common.py new file mode 100644 index 0000000..f6fb046 --- /dev/null +++ b/pyculib/sorting/common.py @@ -0,0 +1,26 @@ +from __future__ import print_function, absolute_import, division +from numba import findlib +import ctypes +import os +import platform +import warnings + +def library_extension(): + p = platform.system() + if p == 'Linux': + return 'so' + if p == 'Windows': + return 'dll' + if p == 'Darwin': + return 'dylib' + +def load_lib(libname): + fullname = 'pyculib_%s.%s' % (libname, library_extension()) + devlib = os.path.join(os.path.abspath(os.path.dirname(__file__)), fullname) + if os.path.exists(devlib): + libpath = devlib + warnings.warn('Using in-tree library %s' % libpath) + else: + libpath = os.path.join(findlib.get_lib_dir(), fullname) + + return ctypes.CDLL(libpath) diff --git a/pyculib/sorting/radixsort.py b/pyculib/sorting/radixsort.py new file mode 100644 index 0000000..cbcf787 --- /dev/null +++ b/pyculib/sorting/radixsort.py @@ -0,0 +1,275 @@ +""" +Uses radixsort implementation from CUB which has the following license: + +Copyright (c) 2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +from __future__ import print_function, absolute_import, division +import ctypes +from .common import load_lib +from contextlib import contextmanager +from numba.cuda.cudadrv.driver import device_pointer +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.devicearray import auto_device, is_cuda_ndarray +from numba import cuda +import numpy as np + +lib = load_lib('radixsort') + +_argtypes = [ + ctypes.c_void_p, # temp + ctypes.c_uint, # count + ctypes.c_void_p, # d_key + ctypes.c_void_p, # d_key_alt + ctypes.c_void_p, # d_vals + ctypes.c_void_p, # d_vals_alt + cu_stream, + ctypes.c_int, # descending + ctypes.c_uint, # begin_bit + ctypes.c_uint, # end_bit +] + +_support_types = { + np.float32: 'float', + np.float64: 'double', + np.int32: 'int32', + np.uint32: 'uint32', + np.int64: 'int64', + np.uint64: 'uint64' +} + +_overloads = {} + + +def _init(): + for ty, name in _support_types.items(): + dtype = np.dtype(ty) + fn = getattr(lib, "radixsort_{0}".format(name)) + _overloads[dtype] = fn + fn.argtypes = _argtypes + fn.restype = ctypes.c_void_p + + +_init() + +lib.radixsort_cleanup.argtypes = [ctypes.c_void_p] + + +def _devptr(p): + if p is None: + return None + else: + return device_pointer(p) + + +@contextmanager +def _autodevice(ary, stream, firstk=None): + if ary is not None: + dptr, conv = auto_device(ary, stream=stream) + yield dptr + if conv: + if firstk is None: + dptr.copy_to_host(ary, stream=stream) + else: + dptr.bind(stream)[:firstk].copy_to_host(ary[:firstk], + stream=stream) + else: + yield None + + +@cuda.jit +def _cu_arange(ary, count): + i = cuda.grid(1) + if i < count: + ary[i] = i + + +class RadixSort(object): + """Provides radix sort and radix select. + + The algorithm implemented here is best for large arrays (``N > 1e6``) due to + the latency introduced by its use of multiple kernel launches. It is + recommended to use ``segmented_sort`` instead for batches of smaller arrays. + + :type maxcount: int + :param maxcount: Maximum number of items to sort + :type dtype: numpy.dtype + :param dtype: The element type to sort + :type descending: bool + :param descending: Sort in descending order? + :param stream: The CUDA stream to run the kernels in + """ + + def __init__(self, maxcount, dtype, descending=False, stream=0): + self.maxcount = int(maxcount) + self.dtype = np.dtype(dtype) + self._arysize = int(self.maxcount * self.dtype.itemsize) + self.descending = descending + self.stream = stream + self._sort = _overloads[self.dtype] + self._cleanup = lib.radixsort_cleanup + + ctx = cuda.current_context() + self._temp_keys = ctx.memalloc(self._arysize) + self._temp_vals = ctx.memalloc(self._arysize) + self._temp = self._call(temp=None, keys=None, vals=None) + + def __del__(self): + try: + self.close() + except: + pass + + def close(self): + """Explicitly release internal resources + + Called automatically when the object is deleted. 
+ """ + if self._temp is not None: + self._cleanup(self._temp) + self._temp = None + + def _call(self, temp, keys, vals, begin_bit=0, end_bit=None): + stream = self.stream.handle if self.stream else self.stream + begin_bit = begin_bit + end_bit = end_bit or self.dtype.itemsize * 8 + descending = int(self.descending) + + count = self.maxcount + if keys: + count = keys.size + + return self._sort( + temp, + ctypes.c_uint(count), + _devptr(keys), + _devptr(self._temp_keys), + _devptr(vals), + _devptr(self._temp_vals), + stream, + descending, + begin_bit, + end_bit + ) + + def _sentry(self, ary): + if ary.dtype != self.dtype: + raise TypeError("dtype mismatch") + if ary.size > self.maxcount: + raise ValueError("keys array too long") + + def sort(self, keys, vals=None, begin_bit=0, end_bit=None): + """ + Perform a inplace sort on ``keys``. Memory transfer is performed + automatically. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type vals: numpy.ndarray + :param vals: Optional. Additional values to be reordered along the sort. + It is modified in place. Only the ``uint32`` dtype is + supported in this version. + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + """ + self._sentry(keys) + with _autodevice(keys, self.stream) as d_keys: + with _autodevice(vals, self.stream) as d_vals: + self._call(self._temp, keys=d_keys, vals=d_vals, + begin_bit=begin_bit, end_bit=end_bit) + + def select(self, k, keys, vals=None, begin_bit=0, end_bit=None): + """Perform a inplace k-select on ``keys``. + + Memory transfer is performed automatically. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type vals: numpy.ndarray + :param vals: Optional. Additional values to be reordered along the sort. + It is modified in place. Only the ``uint32`` dtype is + supported in this version. + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + """ + self._sentry(keys) + with _autodevice(keys, self.stream, firstk=k) as d_keys: + with _autodevice(vals, self.stream, firstk=k) as d_vals: + self._call(self._temp, keys=d_keys, vals=d_vals, + begin_bit=begin_bit, end_bit=end_bit) + + def init_arg(self, size): + """Initialize an empty CUDA ndarray of uint32 with ascending integers + starting from zero + + :type size: int + :param size: Number of elements for the output array + :return: An array with values ``[0, 1, 2, ...m size - 1 ]`` + """ + d_vals = cuda.device_array(size, dtype=np.uint32, stream=self.stream) + _cu_arange.forall(d_vals.size, stream=self.stream)(d_vals, size) + return d_vals + + def argselect(self, k, keys, begin_bit=0, end_bit=None): + """Similar to ``RadixSort.select`` but returns the new sorted indices. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + :return: The indices indicating the new ordering as an array on the CUDA + device or on the host. + """ + d_vals = self.init_arg(keys.size) + self.select(k, keys, vals=d_vals, begin_bit=begin_bit, end_bit=end_bit) + res = d_vals.bind(self.stream)[:k] + if not is_cuda_ndarray(keys): + res = res.copy_to_host(stream=self.stream) + return res + + def argsort(self, keys, begin_bit=0, end_bit=None): + """Similar to ``RadixSort.sort`` but returns the new sorted indices. 
+ + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + :return: The indices indicating the new ordering as an array on the CUDA + device or on the host. + """ + d_vals = self.init_arg(keys.size) + self.sort(keys, vals=d_vals, begin_bit=begin_bit, end_bit=end_bit) + res = d_vals + if not is_cuda_ndarray(keys): + res = res.copy_to_host(stream=self.stream) + return res + diff --git a/pyculib/sorting/segsort.py b/pyculib/sorting/segsort.py new file mode 100644 index 0000000..36f1a98 --- /dev/null +++ b/pyculib/sorting/segsort.py @@ -0,0 +1,116 @@ +""" +Uses segmented sort implementation from ModernGPU which has the following +license: + +Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +from __future__ import print_function, absolute_import, division +import ctypes +from .common import load_lib +from contextlib import contextmanager +from numba.cuda.cudadrv.driver import device_pointer +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.devicearray import auto_device +import numpy as np + +lib = load_lib('segsort') + +_argtypes = [ + # d_key + ctypes.c_void_p, + # d_vals + ctypes.c_void_p, + # N + ctypes.c_uint, + # segments + ctypes.c_void_p, + # Nseg + ctypes.c_uint, + # stream + cu_stream, +] + +_support_types = { + np.float32: 'float32', + np.float64: 'float64', + np.int32: 'int32', + np.uint32: 'uint32', + np.int64: 'int64', + np.uint64: 'uint64' +} + +_overloads = {} + + +def _init(): + for k, v in _support_types.items(): + fn = getattr(lib, 'segsortpairs_{0}'.format(v)) + fn.argtypes = _argtypes + _overloads[np.dtype(k)] = fn + + +_init() + + +@contextmanager +def _autodevice(ary, stream): + if ary is not None: + dptr, conv = auto_device(ary, stream=stream) + yield dptr + if conv: + dptr.copy_to_host(ary, stream=stream) + else: + yield None + + +def _segmentedsort(d_keys, d_vals, d_segments, stream): + _overloads[d_keys.dtype](device_pointer(d_keys), + device_pointer(d_vals), + d_keys.size, + device_pointer(d_segments), + d_segments.size, + stream.handle if stream else 0) + + +def segmented_sort(keys, vals, segments, stream=0): + """Performs an inplace sort on small segments (N < 1e6). + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace. + :type vals: numpy.ndarray + :param vals: Values to be reordered inplace along the sort. Only the + ``uint32`` dtype is supported in this implementation. + :type segments: numpy.ndarray + :param segments: Segment separation location. e.g. ``array([3, 6, 8])`` for + segments of ``keys[:3]``, ``keys[3:6]``, ``keys[6:8]``, + ``keys[8:]``. + :param stream: Optional. A cuda stream in which the kernels are executed. 
+ """ + with _autodevice(keys, stream) as d_keys: + with _autodevice(vals, stream) as d_vals: + d_segments, _ = auto_device(segments, stream=stream) + _segmentedsort(d_keys, d_vals, d_segments, stream) + diff --git a/pyculib/sparse/__init__.py b/pyculib/sparse/__init__.py new file mode 100644 index 0000000..a84a6f5 --- /dev/null +++ b/pyculib/sparse/__init__.py @@ -0,0 +1,2 @@ +from __future__ import print_function, absolute_import +from .api import * diff --git a/pyculib/sparse/api.py b/pyculib/sparse/api.py new file mode 100644 index 0000000..68b1e1a --- /dev/null +++ b/pyculib/sparse/api.py @@ -0,0 +1,762 @@ +from __future__ import print_function, absolute_import, division +from contextlib import contextmanager +import numpy as np +import scipy.sparse as ss +from numba import cuda +from .binding import (cuSparse, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_INDEX_BASE_ONE) + +dtype_to_char = { + np.dtype(np.float32): 'S', + np.dtype(np.float64): 'D', + np.dtype(np.complex64): 'C', + np.dtype(np.complex128): 'Z', +} + + +def _sentry_ndim(ndim, **kws): + for k, a in kws.items(): + if a.ndim != ndim: + raise ValueError("%s.ndim must be %dD" % (k, ndim)) + + +def _sentry_dtype(dtype, **kws): + for k, a in kws.items(): + if a.dtype != dtype: + raise TypeError("%s.dtype is not %s" % (k, dtype)) + + +@contextmanager +def _readonly(*arys): + ds = [] + for a in arys: + dmem, _ = cuda._auto_device(a) + ds.append(dmem) + yield ds + + +@contextmanager +def _readwrite(*arys): + ds = [] + ws = [] + for a in arys: + dmem, conv = cuda._auto_device(a) + ds.append(dmem) + if conv: + ws.append((a, dmem)) + yield ds + for a, d in ws: + d.copy_to_host(a) + + +class Sparse(object): + """All cuSPARSE functions are available under the Sparse object. + + :param idxbase: The base for indexing, either 0 or 1. Optional, defaults + to 0. + """ + + @cuda.require_context + def __init__(self, idxbase=0): + """ + Args + ---- + - idxbase int + Index base. 
Must be 0 or 1 + """ + if idxbase not in (0, 1): + raise ValueError("Invalid index base") + + self.api = cuSparse() + self.idxbase = (CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_INDEX_BASE_ONE)[idxbase] + + @property + def stream(self): + return self.api.stream + + @stream.setter + def stream(self, stream): + self.api.stream = stream + + def _get_api(self, fname, dtype): + ch = dtype_to_char[np.dtype(dtype)] + fn = "%s%s" % (ch, fname) + return getattr(self.api, fn) + + def matdescr(self, indexbase=None, diagtype='N', fillmode='L', + matrixtype='G'): + descr = self.api.matdescr() + descr.indexbase = self.idxbase if indexbase is None else indexbase + descr.diagtype = diagtype + descr.fillmode = fillmode + descr.matrixtype = matrixtype + return descr + + # ------------------------------------------------------------------------ + # Level 1 API + + def axpyi(self, alpha, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-axpyi + """ + _sentry_ndim(1, xVal=xVal, xInd=xVal, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("axpyi", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + fn(nnz=nnz, alpha=alpha, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return y + + def doti(self, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-doti + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("doti", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + result = fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return result + + def dotci(self, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dotci + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("dotci", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + result = fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return result + + def gthr(self, y, xVal, xInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gthr + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("gthr", xVal.dtype) + nnz = xVal.size + with _readonly(y, xInd) as [dy, dxind]: + with _readwrite(xVal) as [dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + def gthrz(self, y, xVal, xInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gthrz + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("gthrz", xVal.dtype) + nnz = xVal.size + with _readonly(xInd) as [dxind]: + with _readwrite(y, xVal) as [dy, dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + def roti(self, xVal, xInd, y, c, s): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-roti + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("roti", xVal.dtype) + nnz = xVal.size + with _readonly(xInd) as [dxind]: + with _readwrite(y, xVal) as [dy, dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, c=c, s=s, + idxBase=self.idxbase) + + def sctr(self, xVal, xInd, y): + 
"""http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-sctr + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("sctr", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + # ------------------------------------------------------------------------ + # Level 2 API + + def bsrmv_matrix(self, dir, trans, alpha, descr, bsrmat, x, beta, y): + bsrVal = bsrmat.data + bsrRowPtr = bsrmat.indptr + bsrColInd = bsrmat.indices + nnzb = bsrColInd.size + m, n = bsrmat.shape + blockDim, blockDim1 = bsrmat.blocksize + assert blockDim == blockDim1 + + mb = (m + blockDim - 1) // blockDim + nb = (n + blockDim - 1) // blockDim + + self.bsrmv(dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, + bsrRowPtr, bsrColInd, blockDim, x, beta, y) + + def bsrmv(self, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, + bsrRowPtr, bsrColInd, blockDim, x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsrmv + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(bsrVal.dtype, x=x, y=y) + fn = self._get_api("bsrmv", bsrVal.dtype) + + with _readonly(bsrVal, bsrRowPtr, bsrColInd, x) \ + as [dbsrVal, dbsrRowPtr, dbsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(dirA=dir, transA=trans, mb=mb, nb=nb, nnzb=nnzb, + alpha=alpha, descrA=descr, bsrValA=dbsrVal, + bsrRowPtrA=dbsrRowPtr, bsrColIndA=dbsrColInd, + blockDim=blockDim, x=dx, beta=beta, y=dy) + + def bsrxmv(self, dir, trans, sizeOfMask, mb, nb, nnzb, alpha, descr, + bsrVal, bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, blockDim, + x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsrxmv + """ + + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(bsrVal.dtype, x=x, y=y) + fn = self._get_api("bsrxmv", bsrVal.dtype) + + with _readonly(bsrVal, bsrRowPtr, bsrColInd, bsrMaskPtr, bsrEndPtr, x) \ + as [dbsrVal, dbsrRowPtr, dbsrColInd, dbsrMaskPtr, dbsrEndPtr, dx]: + with _readwrite(y) as [dy]: + fn(dirA=dir, transA=trans, sizeOfMask=sizeOfMask, + mb=mb, nb=nb, nnzb=nnzb, alpha=alpha, descrA=descr, + bsrValA=dbsrVal, bsrRowPtrA=dbsrRowPtr, + bsrColIndA=dbsrColInd, bsrMaskPtrA=dbsrMaskPtr, + bsrEndPtrA=dbsrEndPtr, blockDim=blockDim, x=dx, beta=beta, + y=dy) + + def csrmv(self, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, + csrColInd, x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmv + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(csrVal.dtype, x=x, y=y) + fn = self._get_api("csrmv", csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd, x) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(transA=trans, m=m, n=n, nnz=nnz, + alpha=alpha, descrA=descr, csrValA=dcsrVal, + csrRowPtrA=dcsrRowPtr, csrColIndA=dcsrColInd, x=dx, + beta=beta, y=dy) + + def csrsv_analysis(self, trans, m, nnz, descr, csrVal, csrRowPtr, + csrColInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsvanalysis + + Returns + ------- + SolveAnalysisInfo + """ + fn = self._get_api("csrsv_analysis", csrVal.dtype) + info = self.api.solve_analysis_info() + with _readonly(csrVal, csrRowPtr, csrColInd) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd]: + fn(transA=trans, m=m, nnz=nnz, descrA=descr, csrValA=dcsrVal, + csrRowPtrA=dcsrRowPtr, csrColIndA=dcsrColInd, info=info) + return info + + def csrsv_solve(self, trans, m, alpha, descr, csrVal, csrRowPtr, + csrColInd, info, x, y): + 
"""http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsvsolve + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(csrVal.dtype, x=x, y=y) + fn = self._get_api("csrsv_solve", csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd, x) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(transA=trans, m=m, alpha=alpha, descrA=descr, + csrValA=dcsrVal, csrRowPtrA=dcsrRowPtr, + csrColIndA=dcsrColInd, info=info, x=dx, y=dy) + + hybmv = NotImplemented + hybmv_analysis = NotImplemented + hybmv_solve = NotImplemented + + # ------------------------------------------------------------------------ + # Level 3 API + + def csrmm(self, transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmm + """ + _sentry_dtype(csrValA.dtype, B=B, C=C) + fn = self._get_api("csrmm", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, B) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dB]: + with _readwrite(C) as [dC]: + fn(transA=transA, m=m, n=n, k=k, nnz=nnz, alpha=alpha, + descrA=descrA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, B=dB, ldb=ldb, beta=beta, C=dC, + ldc=ldc) + + def csrmm2(self, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmm2 + """ + _sentry_dtype(csrValA.dtype, B=B, C=C) + fn = self._get_api("csrmm2", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, B) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dB]: + with _readwrite(C) as [dC]: + fn(transa=transA, transb=transB, m=m, n=n, k=k, nnz=nnz, + alpha=alpha, + descrA=descrA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, B=dB, ldb=ldb, beta=beta, C=dC, + ldc=ldc) + + def csrsm_analysis(self, transA, m, nnz, descrA, csrValA, csrRowPtrA, + csrColIndA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsmanalysis + """ + fn = self._get_api("csrsm_analysis", csrValA.dtype) + info = self.api.solve_analysis_info() + with _readonly(csrValA, csrRowPtrA, csrColIndA) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA]: + fn(transA=transA, m=m, nnz=nnz, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, info=info) + return info + + def csrsm_solve(self, transA, m, n, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, info, X, ldx, Y, ldy): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsmsolve + """ + fn = self._get_api("csrsm_solve", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, X) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dX]: + with _readwrite(Y) as [dY]: + fn(transA=transA, m=m, n=n, alpha=alpha, descrA=descrA, + csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info, x=dX, ldx=ldx, y=dY, + ldy=ldy) + + # ------------------------------------------------------------------------ + # Extra API + + def XcsrgeamNnz(self, m, n, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, + nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgeam + + Returns + ------- + int + nnzC + """ + fn = self.api.XcsrgeamNnz + with _readonly(csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB) \ + as (dcsrRowPtrA, dcsrColIndA, dcsrRowPtrB, dcsrColIndB): + with _readwrite(csrRowPtrC) as [dcsrRowPtrC]: + nnzC = fn(m=m, n=n, descrA=descrA, nnzA=nnzA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, descrB=descrB, 
nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrRowPtrC=dcsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnzC + + def csrgeam(self, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgeam + """ + fn = self._get_api("csrgeam", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB, csrRowPtrC) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dcsrValB, dcsrRowPtrB, + dcsrColIndB, dcsrRowPtrC]: + with _readwrite(csrValC, csrColIndC) as [dcsrValC, dcsrColIndC]: + fn(m=m, n=n, alpha=alpha, descrA=descrA, nnzA=nnzA, + csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, csrValB=dcsrValB, + descrB=descrB, nnzB=nnzB, beta=beta, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + + def XcsrgemmNnz(self, transA, transB, m, n, k, descrA, nnzA, csrRowPtrA, + csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, + csrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm + + Returns + ------- + int + nnzC + """ + fn = self.api.XcsrgemmNnz + with _readonly(csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB) \ + as (dcsrRowPtrA, dcsrColIndA, dcsrRowPtrB, dcsrColIndB): + with _readwrite(csrRowPtrC) as [dcsrRowPtrC]: + nnzC = fn(transA=transA, transB=transB, k=k, m=m, n=n, + descrA=descrA, nnzA=nnzA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, descrB=descrB, nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrRowPtrC=dcsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnzC + + def csrgemm(self, transA, transB, m, n, k, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm + """ + fn = self._get_api("csrgemm", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB, csrRowPtrC) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dcsrValB, dcsrRowPtrB, + dcsrColIndB, dcsrRowPtrC]: + with _readwrite(csrValC, csrColIndC) as [dcsrValC, dcsrColIndC]: + fn(transA=transA, transB=transB, m=m, n=n, k=k, descrA=descrA, + nnzA=nnzA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, csrValB=dcsrValB, + descrB=descrB, nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + def csrgemm_ez(self, matA, matB, transA='N', transB='N', descrA=None, + descrB=None, descrC=None): + """ + Raises ValueError if the result is entirely zero. + + Returns + ------- + CudaSparseMatrix + a csr matrix of the matrix product (matA * matB). 
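+
+        A hedged usage sketch (illustrative only; assumes a CUDA device and
+        that ``matA``/``matB`` are ``CudaCSRMatrix`` objects such as those
+        returned by :func:`csr_matrix` defined later in this module)::
+
+            import numpy as np
+            from pyculib.sparse import Sparse, csr_matrix
+
+            sp = Sparse()
+            A = csr_matrix(np.array([[1., 0.], [0., 2.]], dtype=np.float32))
+            B = csr_matrix(np.array([[0., 3.], [4., 0.]], dtype=np.float32))
+            C = sp.csrgemm_ez(A, B)             # CudaCSRMatrix holding A * B
+            dense = C.copy_to_host().toarray()  # [[0., 3.], [8., 0.]]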
+ + Notes + ----- + Calls XcsrgemmNnz and csrgemm + """ + tmpdescr = self.matdescr() + descrA = descrA or tmpdescr + descrB = descrB or tmpdescr + descrC = descrC or tmpdescr + + dtype = matA.dtype + m, ka = matA.shape + kb, n = matB.shape + if ka != kb: + raise ValueError("incompatible matrices") + k = ka + + indptrC = cuda.device_array(m + 1, dtype='int32') + nnz = self.XcsrgemmNnz(transA, transB, m, n, k, descrA, matA.nnz, + matA.indptr, matA.indices, descrB, matB.nnz, + matB.indptr, matB.indices, descrC, indptrC) + + if nnz == 0: + raise ValueError("result is entirely zero") + + dataC = cuda.device_array(nnz, dtype=dtype) + indicesC = cuda.device_array(nnz, dtype='int32') + self.csrgemm(transA, transB, m, n, k, descrA, matA.nnz, matA.data, + matA.indptr, matA.indices, descrB, matB.nnz, matB.data, + matB.indptr, matB.indices, descrC, dataC, indptrC, + indicesC) + + return CudaCSRMatrix().from_attributes(data=dataC, indices=indicesC, + indptr=indptrC, shape=(m, n), + dtype=dtype, nnz=nnz) + + # ------------------------------------------------------------------------ + # Preconditioners + + def csric0(self, trans, m, descr, csrValM, csrRowPtrA, csrColIndA, info): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csric0 + """ + fn = self._get_api("csric0", csrValM.dtype) + with _readonly(csrRowPtrA, csrColIndA) as [dcsrRowPtrA, dcsrColIndA]: + with _readwrite(csrValM) as [dcsrValM]: + fn(trans=trans, m=m, descrA=descr, + csrValA_ValM=dcsrValM, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info) + + def csrilu0(self, trans, m, descr, csrValM, csrRowPtrA, csrColIndA, info): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrilu0 + """ + fn = self._get_api("csrilu0", csrValM.dtype) + with _readonly(csrRowPtrA, csrColIndA) as [dcsrRowPtrA, dcsrColIndA]: + with _readwrite(csrValM) as [dcsrValM]: + fn(trans=trans, m=m, descrA=descr, + csrValA_ValM=dcsrValM, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info) + + def gtsv(self, m, n, dl, d, du, B, ldb): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsv + """ + fn = self._get_api("gtsv", B.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(B) as [dB]: + fn(m=m, n=n, dl=ddl, d=dd, du=ddu, B=dB, ldb=ldb) + + def gtsv_nopivot(self, m, n, dl, d, du, B, ldb): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsv_nopivot + """ + fn = self._get_api("gtsv_nopivot", B.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(B) as [dB]: + fn(m=m, n=n, dl=ddl, d=dd, du=ddu, B=dB, ldb=ldb) + + def gtsvStridedBatch(self, m, dl, d, du, x, batchCount, batchStride): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsvstridedbatch + """ + fn = self._get_api("gtsvStridedBatch", x.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(x) as [dx]: + fn(m=m, dl=ddl, d=dd, du=ddu, x=dx, + batchCount=batchCount, batchStride=batchStride) + + # ------------------------------------------------------------------------ + # Format Conversion + + def bsr2csr(self, dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, bsrColIndA, + blockDim, descrC, csrValC, csrRowPtrC, csrColIndC): + fn = self._get_api('bsr2csr', bsrValA.dtype) + with _readonly(bsrValA, bsrRowPtrA, bsrColIndA) as [dbsrValA, + dbsrRowPtrA, + dbsrColIndA]: + with _readwrite(csrValC, csrRowPtrC, csrColIndC) as [dcsrValC, + dcsrRowPtrC, + dcsrColIndC]: + fn(dirA=dirA, mb=mb, nb=nb, descrA=descrA, bsrValA=dbsrValA, + bsrRowPtrA=dbsrRowPtrA, bsrColIndA=dbsrColIndA, + blockDim=blockDim, 
descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + def Xcoo2csr(self, cooRowInd, nnz, m, csrRowPtr): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsr2csr + """ + fn = self.api.Xcoo2csr + with _readonly(cooRowInd) as [dcooRowInd]: + with _readwrite(csrRowPtr) as [dcsrRowPtr]: + fn(cooRowInd=dcooRowInd, nnz=nnz, m=m, csrRowPtr=dcsrRowPtr, + idxBase=self.idxbase) + + def csc2dense(self, m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csc2dense + """ + fn = self._get_api('csc2dense', cscValA.dtype) + with _readonly(cscValA, cscRowIndA, cscColPtrA) as [dcscValA, + dcscRowIndA, + dcscColPtrA]: + with _readwrite(A) as [dA]: + fn(m=m, n=n, descrA=descrA, cscValA=dcscValA, + cscRowIndA=dcscRowIndA, cscColPtrA=dcscColPtrA, A=dA, + lda=lda) + + csc2hyb = NotImplemented + + def Xcsr2bsrNnz(self, dirA, m, n, descrA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2bsr + """ + fn = self.api.Xcsr2bsrNnz + with _readonly(csrRowPtrA, csrColIndA, bsrRowPtrC) as [dcsrRowPtrA, + dcsrColIndA, + dbsrRowPtrC]: + nnz = fn(dirA=dirA, m=m, n=n, descrA=descrA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, + blockDim=blockDim, + descrC=descrC, bsrRowPtrC=dbsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnz + + def csr2bsr(self, dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2bsr + """ + fn = self._get_api('csr2bsr', csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + with _readwrite(bsrValC, bsrRowPtrC, bsrColIndC) as [dbsrValC, + dbsrRowPtrC, + dbsrColIndC]: + nnz = fn(dirA=dirA, m=m, n=n, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, + blockDim=blockDim, descrC=descrC, bsrValC=dbsrValC, + bsrRowPtrC=dbsrRowPtrC, bsrColIndC=dbsrColIndC) + return nnz + + def Xcsr2coo(self, csrRowPtr, nnz, m, cooRowInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2coo + """ + fn = self.api.Xcsr2coo + with _readonly(csrRowPtr) as [dcsrRowPtr]: + with _readwrite(cooRowInd) as [dcooRowInd]: + fn(csrRowPtr=dcsrRowPtr, nnz=nnz, m=m, cooRowInd=dcooRowInd, + idxBase=self.idxbase) + + def csr2csc(self, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscRowInd, cscColPtr, copyValues): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2csc + """ + fn = self._get_api('csr2csc', csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd) as [dcsrVal, dcsrRowPtr, + dcsrColInd]: + with _readwrite(cscVal, cscRowInd, cscColPtr) as [dcscVal, + dcscRowInd, + dcscColPtr]: + fn(m=m, n=n, nnz=nnz, csrVal=dcsrVal, csrRowPtr=dcsrRowPtr, + csrColInd=dcsrColInd, cscVal=dcscVal, cscRowInd=dcscRowInd, + cscColPtr=dcscColPtr, copyValues=copyValues, + idxBase=self.idxbase) + + def csr2dense(self, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2dense + """ + fn = self._get_api('csr2dense', csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + with _readwrite(A) as [dA]: + fn(m=m, n=n, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, A=dA, + lda=lda) + + csr2hyb = NotImplemented + + def dense2csc(self, m, n, descrA, A, lda, nnzPerCol, cscValA, cscRowIndA, + 
cscColPtrA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dense2csc + """ + fn = self._get_api('dense2csc', cscValA.dtype) + with _readonly(A, nnzPerCol) as [dA, nnzPerCol]: + with _readwrite(cscValA, cscRowIndA, cscColPtrA) as [dcscValA, + dcscRowIndA, + dcscColPtrA]: + fn(m=m, n=n, descrA=descrA, A=dA, lda=lda, + nnzPerCol=nnzPerCol, cscValA=dcscValA, + cscRowIndA=dcscRowIndA, + cscColPtrA=dcscColPtrA) + + def dense2csr(self, m, n, descrA, A, lda, nnzPerRow, csrValA, + csrRowPtrA, csrColIndA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dense2csr + + Returns + ------- + int + nnzTotalDevHostPtr + """ + fn = self._get_api('dense2csr', A.dtype) + with _readonly(A, nnzPerRow) as [dA, nnzPerRow]: + with _readwrite(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + fn(m=m, n=n, descrA=descrA, A=dA, lda=lda, + nnzPerRow=nnzPerRow, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA) + + dense2hyb = NotImplemented + hyb2csc = NotImplemented + hyb2csr = NotImplemented + hyb2dense = NotImplemented + + def nnz(self, dirA, m, n, descrA, A, lda, nnzPerRowCol): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-nnz + """ + fn = self._get_api('nnz', A.dtype) + with _readonly(A) as [dA]: + with _readwrite(nnzPerRowCol) as [dnnzPerRowCol]: + nnzTotal = fn(dirA=dirA, m=m, n=n, descrA=descrA, A=dA, + nnzPerRowCol=dnnzPerRowCol, lda=lda, + nnzTotalDevHostPtr=0) + return nnzTotal + + +# ------------------------------------------------------------------------ +# Matrix Ctors + +class CudaSparseMatrix(object): + def from_host_matrix(self, matrix, stream=0): + dtype = matrix.dtype + shape = matrix.shape + nnz = matrix.nnz + data = cuda.to_device(matrix.data, stream=stream) + indices = cuda.to_device(matrix.indices, stream=stream) + indptr = cuda.to_device(matrix.indptr, stream=stream) + self.from_attributes(dtype=dtype, shape=shape, nnz=nnz, data=data, + indices=indices, indptr=indptr) + return self + + def from_attributes(self, dtype, shape, nnz, data, indices, indptr): + self.dtype = dtype + self.shape = shape + self.ndim = len(shape) + self.nnz = nnz + self.data = data + self.indices = indices + self.indptr = indptr + return self + + def copy_to_host(self, stream=0): + data = self.data.copy_to_host(stream=stream) + indices = self.indices.copy_to_host(stream=stream) + indptr = self.indptr.copy_to_host(stream=stream) + return self.host_constructor((data, indices, indptr), shape=self.shape) + + +class CudaBSRMatrix(CudaSparseMatrix): + host_constructor = ss.bsr_matrix + + def from_host_matrix(self, matrix, stream=0): + super(CudaBSRMatrix, self).from_host_matrix(matrix, stream=stream) + self.blocksize = matrix.blocksize + return self + + +class CudaCSCMatrix(CudaSparseMatrix): + host_constructor = ss.csc_matrix + + +class CudaCSRMatrix(CudaSparseMatrix): + host_constructor = ss.csr_matrix + + +def bsr_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.bsr_matrix``. + + Returns a BSR CUDA matrix. + """ + mat = ss.bsr_matrix(*args, **kws) + return CudaBSRMatrix().from_host_matrix(mat) + + +def csc_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.csc_matrix``. + + Returns a CSC CUDA matrix. + """ + mat = ss.csc_matrix(*args, **kws) + return CudaCSCMatrix().from_host_matrix(mat) + + +def csr_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.csr_matrix``. + + Returns a CSR CUDA matrix. 
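+
+    A minimal sketch (illustrative; assumes a CUDA-capable device and SciPy)::
+
+        import numpy as np
+        from pyculib.sparse import csr_matrix
+
+        dense = np.array([[1., 0.], [0., 2.]], dtype=np.float32)
+        mat = csr_matrix(dense)        # device-resident CSR copy of ``dense``
+        host = mat.copy_to_host()      # back to a scipy.sparse.csr_matrix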
+ """ + mat = ss.csr_matrix(*args, **kws) + return CudaCSRMatrix().from_host_matrix(mat) diff --git a/pyculib/sparse/binding.py b/pyculib/sparse/binding.py new file mode 100644 index 0000000..b1a44c5 --- /dev/null +++ b/pyculib/sparse/binding.py @@ -0,0 +1,1307 @@ +from __future__ import print_function, absolute_import, division + +from ctypes import c_float, c_double, byref, c_int, c_void_p + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import (Lib, ctype_function, finalizer, + c_complex, c_double_complex, memalign) + +from . import decls + +INV_STATUS = dict( + CUSPARSE_STATUS_SUCCESS=0, + CUSPARSE_STATUS_NOT_INITIALIZED=1, + CUSPARSE_STATUS_ALLOC_FAILED=2, + CUSPARSE_STATUS_INVALID_VALUE=3, + CUSPARSE_STATUS_ARCH_MISMATCH=4, + CUSPARSE_STATUS_MAPPING_ERROR=5, + CUSPARSE_STATUS_EXECUTION_FAILED=6, + CUSPARSE_STATUS_INTERNAL_ERROR=7, + CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED=8 +) + +STATUS = dict((v, k) for k, v in INV_STATUS.items()) + +CUSPARSE_INDEX_BASE_ZERO = 0 +CUSPARSE_INDEX_BASE_ONE = 1 + +CUSPARSE_DIAG_TYPE_NON_UNIT = 0 +CUSPARSE_DIAG_TYPE_UNIT = 1 + +CUSPARSE_FILL_MODE_LOWER = 0 +CUSPARSE_FILL_MODE_UPPER = 1 + +CUSPARSE_MATRIX_TYPE_GENERAL = 0 +CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 +CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 +CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 + +CUSPARSE_ACTION_SYMBOLIC = 0 +CUSPARSE_ACTION_NUMERIC = 1 + +CUSPARSE_POINTER_MODE_HOST = 0 +CUSPARSE_POINTER_MODE_DEVICE = 1 + +CUSPARSE_OPERATION_NON_TRANSPOSE = 0 +CUSPARSE_OPERATION_TRANSPOSE = 1 +CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 + +CUSPARSE_DIRECTION_ROW = 0 +CUSPARSE_DIRECTION_COLUMN = 1 + +# automatically decide how to split the +# data into regular/irregular part +CUSPARSE_HYB_PARTITION_AUTO = 0 +# store data into regular part up to a user +# specified treshhold +CUSPARSE_HYB_PARTITION_USER = 1 +# store all data in the regular part +CUSPARSE_HYB_PARTITION_MAX = 2 + +cusparseHandle_t = c_void_p +cusparseMatDescr_t = c_void_p +cusparseSolveAnalysisInfo_t = c_void_p +cusparseHybMat_t = c_void_p + +cusparseStatus_t = c_int +cusparseIndexBase_t = c_int +cusparsePointerMode_t = c_int +cusparseAction_t = c_int +cusparseFillMode_t = c_int +cusparseDiagType_t = c_int +cusparseOperation_t = c_int +cusparseDirection_t = c_int +cusparseHybPartition_t = c_int +cusparseMatrixType_t = c_int + +_c_types = { + 'int': c_int, + 'float': c_float, + 'double': c_double, + 'cuComplex': c_complex, + 'cuDoubleComplex': c_double_complex, + 'cudaStream_t': cu_stream, + 'cusparseStatus_t': cusparseStatus_t, + 'cusparseMatDescr_t': cusparseMatDescr_t, + 'cusparseSolveAnalysisInfo_t': cusparseSolveAnalysisInfo_t, + 'cusparseHybMat_t': cusparseHybMat_t, + 'cusparseHandle_t': cusparseHandle_t, + 'cusparsePointerMode_t': cusparsePointerMode_t, + 'cusparseAction_t': cusparseAction_t, + 'cusparseFillMode_t': cusparseFillMode_t, + 'cusparseDiagType_t': cusparseDiagType_t, + 'cusparseOperation_t': cusparseOperation_t, + 'cusparseDirection_t': cusparseDirection_t, + 'cusparseHybPartition_t': cusparseHybPartition_t, + 'cusparseIndexBase_t': cusparseIndexBase_t, + 'cusparseMatrixType_t': cusparseMatrixType_t, +} + + +class CuSparseError(Exception): + def __init__(self, code): + super(CuSparseError, self).__init__(STATUS[code]) + + +def _get_type(k): + try: + return _c_types[k] + except KeyError: + if k[-1] == '*': + return c_void_p + #return POINTER(_get_type(k[:-1])) + raise + + +def _init_ctype_function(name, decl): + res, args = decl + types = [_get_type(a) for _, a in 
args] + return ctype_function(_get_type(res), *types) + + +def _declarations(): + for k in dir(decls): + if not k.startswith('_'): + yield k, getattr(decls, k) + + +class _libcusparse(Lib): + lib = 'cusparse' + ErrorType = CuSparseError + + +def _init_libcusparse(): + gv = {} + for k, v in _declarations(): + gv[k] = _init_ctype_function(k, v) + base = _libcusparse + return type('libcusparse', (base,), gv) + + +libcusparse = _init_libcusparse() + +DIAGTYPE = (CUSPARSE_DIAG_TYPE_NON_UNIT, + CUSPARSE_DIAG_TYPE_UNIT) + +DIAGTYPECHAR = ('N', 'U') + +FILLMODE = (CUSPARSE_FILL_MODE_LOWER, + CUSPARSE_FILL_MODE_UPPER) + +FILLMODECHAR = ('L', 'U') + +MATRIXTYPE = (CUSPARSE_MATRIX_TYPE_GENERAL, + CUSPARSE_MATRIX_TYPE_SYMMETRIC, + CUSPARSE_MATRIX_TYPE_HERMITIAN, + CUSPARSE_MATRIX_TYPE_TRIANGULAR) + +MATRIXTYPECHAR = ('G', 'S', 'H', 'T') + + +class MatDescr(finalizer.OwnerMixin): + def __init__(self, api, handle): + self._api = api + self._handle = handle + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroyMatDescr(handle) + + @property + def diagtype(self): + return DIAGTYPECHAR[self._api.cusparseGetMatDiagType(self._handle)] + + @diagtype.setter + def diagtype(self, val): + self._api.cusparseSetMatDiagType(self._handle, + DIAGTYPECHAR.index(val)) + + @property + def fillmode(self): + return FILLMODECHAR[self._api.cusparseGetMatFillMode(self._handle)] + + @fillmode.setter + def fillmode(self, val): + return self._api.cusparseSetMatFillMode(self._handle, + FILLMODECHAR.index(val)) + + @property + def indexbase(self): + return self._api.cusparseGetMatIndexBase(self._handle) + + @indexbase.setter + def indexbase(self, val): + return self._api.cusparseSetMatIndexBase(self._handle, val) + + @property + def matrixtype(self): + return MATRIXTYPECHAR[self._api.cusparseGetMatType(self._handle)] + + @matrixtype.setter + def matrixtype(self, val): + return self._api.cusparseSetMatType(self._handle, + MATRIXTYPECHAR.index(val)) + + +class SolveAnalysisInfo(finalizer.OwnerMixin): + def __init__(self, api, handle): + self._api = api + self._handle = handle + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroySolveAnalysisInfo(handle) + + +class _cuSparse(finalizer.OwnerMixin): + def __init__(self): + self._api = libcusparse() + self._handle = cusparseHandle_t() + try: + self._api.cusparseCreate(byref(self._handle)) + except CuSparseError: + raise RuntimeError("Cannot initialize cuSparse. 
" + "Could be caused by insufficient GPU memory.") + self._finalizer_track((self._handle, self._api)) + # Default to NULL stream + self._stream = 0 + # Default to host pointer + self.use_host_pointer() + + def matdescr(self): + handle = cusparseMatDescr_t() + self._api.cusparseCreateMatDescr(byref(handle)) + return MatDescr(self._api, handle) + + def solve_analysis_info(self): + handle = cusparseSolveAnalysisInfo_t() + self._api.cusparseCreateSolveAnalysisInfo(byref(handle)) + return SolveAnalysisInfo(self._api, handle) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroy(handle) + + @property + def version(self): + ver = c_int() + self._api.cusparseGetVersion(self._handle, byref(ver)) + return ver.value + + @property + def stream(self): + return self._stream + + @stream.setter + def stream(self, stream): + self._stream = stream + self._api.cusparseSetStream(self._handle, self._stream.handle) + + @property + def pointer_mode(self): + mode = cusparsePointerMode_t() + self._api.cusparseGetPointerMode(self._handle, byref(mode)) + return mode.value + + @pointer_mode.setter + def pointer_mode(self, value): + self._api.cusparseSetPointerMode(self._handle, value) + + def use_host_pointer(self): + self.pointer_mode = CUSPARSE_POINTER_MODE_HOST + + def use_device_pointer(self): + self.pointer_mode = CUSPARSE_POINTER_MODE_DEVICE + + +_strip_prefix = 'cusparse' +_len_strip_prefix = len(_strip_prefix) + + +def mangle(name): + assert name.startswith(_strip_prefix) + name = name[_len_strip_prefix:] + return name + + +def _flatten_args(args, kws, argnames, defaults): + values = list(args) + for name in argnames[len(args):]: + if name in kws: + values.append(kws.pop(name)) + elif name in defaults: + values.append(defaults[name]) + else: + raise TypeError("missing '%s' arg" % name) + if kws: + raise TypeError("function has no keyword arguments: %s" % + tuple(kws.keys())) + return values + + +def _make_docstring(name, decl): + ret, args = decl + doc = [] + + doc.append("Wrapper for '%s'" % name) + + doc.append('') + doc.append("Args") + doc.append("----") + for a, t in args: + doc.append("%s: %s" % (a, t)) + + return '\n'.join(doc) + + +def _dummy_preparer(val): + return val, None + + +class _api_function(object): + __slots__ = 'fn', 'argtypes', 'argnames', 'defaults' + + def __init__(self, fn, decl): + self.fn = fn + self.argnames, self.argtypes = zip(*decl[1]) + self.defaults = {} + self.set_defaults() + assert self.argnames[0] == 'handle' + preparers = [] + for k in self.argnames: + pname = 'prepare_%s' % k + if hasattr(self, pname): + meth = getattr(self, pname) + preparers.append(meth) + else: + preparers.append(_dummy_preparer) + + self.preparers = tuple(preparers) + + def __call__(self, *args, **kws): + args = _flatten_args(args, kws, self.argnames, self.defaults) + rargs = [pre(val) for pre, val in zip(self.preparers, args)] + actual, hold = zip(*rargs) + #for k, v in zip(self.argnames, actual): + #print(k, v) + self.fn(*actual) + return self.return_value(*hold) + + def set_defaults(self): + if ('idxBase' in self.argnames and + 'cusparseIndexBase_t' in self.argtypes): + self.defaults['idxBase'] = CUSPARSE_INDEX_BASE_ZERO + + def return_value(self, *args): + return + + +def _make_api_function(name, base): + return type(name, (base,), {}) + + +def _prepare_array(self, val): + return device_pointer(val), val + + +def _prepare_hybpartition(self, val): + if val == 'A': + return CUSPARSE_HYB_PARTITION_AUTO, None + elif val == 'U': + return CUSPARSE_HYB_PARTITION_USER, None 
+ elif val == 'M': + return CUSPARSE_HYB_PARTITION_MAX, None + else: + raise ValueError("Partition flag must be either 'A', 'U' or 'M'") + + +def _prepare_direction_flag(self, val): + if val == 'R': + return CUSPARSE_DIRECTION_ROW, None + elif val == 'C': + return CUSPARSE_DIRECTION_COLUMN, None + else: + raise ValueError("Direction flag must be either 'R' or 'C'") + + +def _prepare_operation_flag(self, val): + if val == 'N': + return CUSPARSE_OPERATION_NON_TRANSPOSE, None + elif val == 'T': + return CUSPARSE_OPERATION_TRANSPOSE, None + elif val == 'C': + return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE, None + else: + raise ValueError("Operation flag must be either 'N', 'T' or 'C'") + + +def _prepare_matdescr(self, val): + return val._handle, None + + +def _prepare_hybmat(self, val): + raise NotImplementedError + + +def _prepare_action(self, val): + if val == 'N': + return CUSPARSE_ACTION_NUMERIC, None + elif val == 'S': + return CUSPARSE_ACTION_SYMBOLIC, None + else: + raise ValueError("Action must be either 'N' or 'S'") + + +def _prepare_solveinfo(self, val): + return val._handle, None + + +class _array_double_complex(object): + def __init__(self, val=None): + cval, data = memalign(c_double_complex, align=16) + self._data = data + self._cval = cval + self._as_parameter_ = byref(self._cval) + + if val is not None: + val = c_double_complex(val) + self._cval.real = val.real + self._cval.imag = val.imag + + @property + def value(self): + return self._cval.value + + +def _prepare_scalar(self, val): + if self.T == c_double_complex: + data = _array_double_complex(val) + return data, data + else: + v = self.T(val) + return byref(v), v + + +def _prepare_scalar_out(self, val): + if self.T == c_double_complex: + data = _array_double_complex(val) + return data, data + else: + v = self.T() + return byref(v), v + + +class _axpyi_v2(_api_function): + __slots__ = () + + prepare_alpha = _prepare_scalar + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + +class Saxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_float + + +class Daxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_double + + +class Caxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_complex + + +class Zaxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_double_complex + + +class _bsr2csr(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + + prepare_bsrValA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + prepare_descrA = _prepare_matdescr + prepare_descrC = _prepare_matdescr + + +Sbsr2csr = Dbsr2csr = Cbsr2csr = Zbsr2csr = _bsr2csr + + +class _bsrmv(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_bsrValA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + prepare_x = _prepare_array + prepare_y = _prepare_array + + prepare_descrA = _prepare_matdescr + + +class Sbsrmv(_bsrmv): + __slots__ = () + T = c_float + + +class Dbsrmv(_bsrmv): + __slots__ = () + T = c_double + + +class Cbsrmv(_bsrmv): + __slots__ = () + T = c_complex + + +class Zbsrmv(_bsrmv): + __slots__ = () + T = c_double_complex + + +class _bsrxmv(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + + prepare_alpha = 
_prepare_scalar + prepare_beta = _prepare_scalar + + prepare_bsrValA = _prepare_array + prepare_bsrMaskPtrA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrEndPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + + prepare_x = _prepare_array + prepare_y = _prepare_array + + prepare_descrA = _prepare_matdescr + + +class Sbsrxmv(_bsrxmv): + __slots__ = () + T = c_float + + +class Dbsrxmv(_bsrxmv): + __slots__ = () + T = c_double + + +class Cbsrxmv(_bsrxmv): + __slots__ = () + T = c_complex + + +class Zbsrxmv(_bsrxmv): + __slots__ = () + T = c_double_complex + + +class _csc2dense(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_A = _prepare_array + + prepare_descrA = _prepare_matdescr + + +Scsc2dense = Dcsc2dense = Ccsc2dense = Zcsc2dense = _csc2dense + + +class _csc2hyb(_api_function): + __slots__ = () + prepare_descrA = _prepare_matdescr + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_hybA = _prepare_hybmat + prepare_partitionType = _prepare_hybpartition + + +Scsc2hyb = Dcsc2hyb = Ccsc2hyb = Zcsc2hyb = _csc2hyb + + +class _csr2bsr(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_descrC = _prepare_matdescr + + prepare_bsrValC = _prepare_array + prepare_bsrRowPtrC = _prepare_array + prepare_bsrColIndC = _prepare_array + + +Scsr2bsr = Dcsr2bsr = Ccsr2bsr = Zcsr2bsr = _csr2bsr + + +class _csr2csc_v2(_api_function): + __slots__ = () + + prepare_csrVal = _prepare_array + prepare_csrRowPtr = _prepare_array + prepare_csrColInd = _prepare_array + prepare_cscVal = _prepare_array + prepare_cscRowInd = _prepare_array + prepare_cscColPtr = _prepare_array + + prepare_copyValues = _prepare_action + + +Scsr2csc_v2 = Dcsr2csc_v2 = Ccsr2csc_v2 = Zcsr2csc_v2 = _csr2csc_v2 + + +class _csr2dense(_api_function): + __slots__ = () + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_A = _prepare_array + + +Scsr2dense = Dcsr2dense = Ccsr2dense = Zcsr2dense = _csr2dense + + +class _csr2hyb(_api_function): + descrA = _prepare_matdescr + csrValA = _prepare_array + csrRowPtrA = _prepare_array + csrColIndA = _prepare_array + hybA = _prepare_hybmat + partitionType = _prepare_hybpartition + + +Scsr2hyb = Dcsr2hyb = Ccsr2hyb = Zcsr2hyb = _csr2hyb + + +class _csrgeam(_api_function): + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_descrB = _prepare_matdescr + prepare_csrValB = _prepare_array + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + + prepare_descrC = _prepare_matdescr + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + +class Scsrgeam(_csrgeam): + T = c_float + + +class Dcsrgeam(_csrgeam): + T = c_double + + +class Ccsrgeam(_csrgeam): + T = c_complex + + +class Zcsrgeam(_csrgeam): + T = c_double_complex + 
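+# Illustrative note (a sketch, not part of the generated API surface): the
+# classes in this module only declare `prepare_<arg>` hooks plus a scalar
+# ctype `T`; the call logic lives in `_api_function.__call__`, which routes
+# each argument through its preparer before invoking the ctypes function.
+# Minimal usage sketch, assuming hypothetical device arrays `d_xVal`,
+# `d_xInd` and `d_y` allocated by the caller, and using the `cuSparse`
+# wrapper built at the bottom of this module:
+#
+#     sp = cuSparse()
+#     sp.Saxpyi(nnz=d_xInd.size, alpha=2.0, xVal=d_xVal, xInd=d_xInd, y=d_y)
+#
+# Here `alpha` goes through `_prepare_scalar` (byref of a `c_float`), the
+# device arrays go through `_prepare_array` (`device_pointer(...)`), and
+# `idxBase` falls back to the CUSPARSE_INDEX_BASE_ZERO default set in
+# `_api_function.set_defaults`.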
+ +class _csrgemm(_api_function): + prepare_transA = _prepare_operation_flag + prepare_transB = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + + prepare_csrValB = _prepare_array + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + +Scsrgemm = Dcsrgemm = Ccsrgemm = Zcsrgemm = _csrgemm + + +class _csric0(_api_function): + prepare_trans = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA_ValM = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsric0 = Dcsric0 = Ccsric0 = Zcsric0 = _csric0 + + +class _csrilu0(_api_function): + prepare_trans = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA_ValM = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrilu0 = Dcsrilu0 = Ccsrilu0 = Zcsrilu0 = _csrilu0 + + +class _csrmm2(_api_function): + prepare_transa = _prepare_operation_flag + prepare_transb = _prepare_operation_flag + + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_B = _prepare_array + prepare_C = _prepare_array + + +class Scsrmm2(_csrmm2): + T = c_float + + +class Dcsrmm2(_csrmm2): + T = c_double + + +class Ccsrmm2(_csrmm2): + T = c_complex + + +class Zcsrmm2(_csrmm2): + T = c_double_complex + + +class _csrmm_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_B = _prepare_array + prepare_beta = _prepare_scalar + prepare_C = _prepare_array + + +class Scsrmm_v2(_csrmm_v2): + T = c_float + + +class Dcsrmm_v2(_csrmm_v2): + T = c_double + + +class Ccsrmm_v2(_csrmm_v2): + T = c_complex + + +class Zcsrmm_v2(_csrmm_v2): + T = c_double_complex + + +class _csrmv_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_x = _prepare_array + prepare_beta = _prepare_scalar + prepare_y = _prepare_array + + +class Scsrmv_v2(_csrmv_v2): + T = c_float + + +class Dcsrmv_v2(_csrmv_v2): + T = c_double + + +class Ccsrmv_v2(_csrmv_v2): + T = c_complex + + +class Zcsrmv_v2(_csrmv_v2): + T = c_double_complex + + +class _csrsm_analysis(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrsm_analysis = Dcsrsm_analysis = _csrsm_analysis +Ccsrsm_analysis = Zcsrsm_analysis = _csrsm_analysis + + +class _csrsm_solve(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = 
_prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +class Scsrsm_solve(_csrsm_solve): + T = c_float + + +class Dcsrsm_solve(_csrsm_solve): + T = c_double + + +class Ccsrsm_solve(_csrsm_solve): + T = c_complex + + +class Zcsrsm_solve(_csrsm_solve): + T = c_double_complex + + +class _csrsv_analysis_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrsv_analysis_v2 = Dcsrsv_analysis_v2 = _csrsv_analysis_v2 +Ccsrsv_analysis_v2 = Zcsrsv_analysis_v2 = _csrsv_analysis_v2 + + +class _csrsv_solve_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +class Scsrsv_solve_v2(_csrsv_solve_v2): + T = c_float + + +class Dcsrsv_solve_v2(_csrsv_solve_v2): + T = c_double + + +class Ccsrsv_solve_v2(_csrsv_solve_v2): + T = c_complex + + +class Zcsrsv_solve_v2(_csrsv_solve_v2): + T = c_double_complex + + +class _dense2csc(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_nnzPerCol = _prepare_array + + +Sdense2csc = Ddense2csc = Cdense2csc = Zdense2csc = _dense2csc + + +class _dense2csr(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_nnzPerRow = _prepare_array + +Sdense2csr = Ddense2csr = Cdense2csr = Zdense2csr = _dense2csr + + +class _dense2hyb(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_nnzPerRow = _prepare_array + prepare_hybA = _prepare_hybmat + prepare_partitionType = _prepare_hybpartition + + +Sdense2hyb = Ddense2hyb = Cdense2hyb = Zdense2hyb = _dense2hyb + + +class _dotci(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + prepare_resultDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('resultDevHostPtr')].value + + def set_defaults(self): + super(_dotci, self).set_defaults() + self.defaults['resultDevHostPtr'] = 0 + + +class Cdotci(_dotci): + T = c_complex + + +class Zdotci(_dotci): + T = c_double_complex + + +class _doti(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + prepare_resultDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('resultDevHostPtr')].value + + def set_defaults(self): + super(_doti, self).set_defaults() + self.defaults['resultDevHostPtr'] = 0 + + +class Sdoti(_doti): + T = c_float + + +class Ddoti(_doti): + T = c_double + + +class Cdoti(_doti): + T = c_complex + + +class Zdoti(_doti): + T = c_double_complex + + +class _gthr(_api_function): + prepare_y = _prepare_array + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + + +Sgthr = Dgthr = Cgthr = Zgthr = _gthr + + +class _gthrz(_api_function): + prepare_y = 
_prepare_array + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + + +Sgthrz = Dgthrz = Cgthrz = Zgthrz = _gthrz + + +class _gtsv(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_B = _prepare_array + + +Sgtsv = Dgtsv = Cgtsv = Zgtsv = _gtsv + + +class _gtsvStridedBatch(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_x = _prepare_array + + +SgtsvStridedBatch = DgtsvStridedBatch = _gtsvStridedBatch +CgtsvStridedBatch = ZgtsvStridedBatch = _gtsvStridedBatch + + +class _gtsv_nopivot(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_B = _prepare_array + + +Sgtsv_nopivot = Dgtsv_nopivot = Cgtsv_nopivot = Zgtsv_nopivot = _gtsv_nopivot + + +class _hyb2csc(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_cscVal = _prepare_array + prepare_cscRowInd = _prepare_array + prepare_cscColPtr = _prepare_array + + +Shyb2csc = Dhyb2csc = Chyb2csc = Zhyb2csc = _hyb2csc + + +class _hyb2csr(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + +Shyb2csr = Dhyb2csr = Chyb2csr = Zhyb2csr = _hyb2csr + + +class _hyb2dense(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_A = _prepare_array + + +Shyb2dense = Dhyb2dense = Chyb2dense = Zhyb2dense = _hyb2dense + + +class _hybmv(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_x = _prepare_array + prepare_beta = _prepare_scalar + + +Shybmv = Dhybmv = Chybmv = Zhybmv = _hybmv + + +class _hybsv_analysis(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_info = _prepare_solveinfo + + +Shybsv_analysis = Dhybsv_analysis = _hybsv_analysis +Chybsv_analysis = Zhybsv_analysis = _hybsv_analysis + + +class _hybsv_solve(_api_function): + prepare_trans = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descra = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +Shybsv_solve = Dhybsv_solve = Chybsv_solve = Zhybsv_solve = _hybsv_solve + + +class _nnz(_api_function): + T = c_int + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_nnzPerRowCol = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +Snnz = Dnnz = Cnnz = Znnz = _nnz + + +class _sctr(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + +Ssctr = Dsctr = Csctr = Zsctr = _sctr + + +class _roti_v2(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + prepare_c = _prepare_scalar + prepare_s = _prepare_scalar + + +class Sroti_v2(_roti_v2): + T = c_float + + +class Droti_v2(_roti_v2): + T = c_double + + +class Xcoo2csr(_api_function): + prepare_cooRowInd = _prepare_array + prepare_csrRowPtr = _prepare_array + + +class Xcsr2coo(_api_function): + prepare_csrRowPtr = 
_prepare_array + prepare_cooRowInd = _prepare_array + + +class Xcsr2bsrNnz(_api_function): + T = c_int + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_bsrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +class XcsrgeamNnz(_api_function): + T = c_int + prepare_descrA = _prepare_matdescr + + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +class XcsrgemmNnz(_api_function): + T = c_int + prepare_transA = _prepare_operation_flag + prepare_transB = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +def _init_api_function(name, decl): + lib = libcusparse() + mangled = mangle(name) + for k in globals().keys(): + if mangled.endswith(k): + base = globals()[k] + break + else: + # print("missing", name) + raise NotImplementedError(name) + # return mangled, None + + docs = _make_docstring(name, decl) + cls = _make_api_function(name, base) + + fn = getattr(lib, name) + + obj = cls(fn, decl) + + def method(self, *args, **kws): + return obj(self._handle, *args, **kws) + + method.__doc__ = docs + + return mangled, method + + +_bypassed = frozenset(''' +cusparseCreate +cusparseDestroy +cusparseCreateHybMat +cusparseCreateMatDescr +cusparseCreateSolveAnalysisInfo +cusparseDestroyHybMat +cusparseDestroyMatDescr +cusparseDestroySolveAnalysisInfo +cusparseGetMatDiagType +cusparseGetLevelInfo +cusparseGetMatFillMode +cusparseGetMatIndexBase +cusparseGetMatType +cusparseSetMatDiagType +cusparseSetMatFillMode +cusparseSetMatIndexBase +cusparseSetMatType +cusparseGetPointerMode +cusparseSetPointerMode +cusparseSetStream +cusparseGetVersion +'''.split()) + + +def _init_cuSparse(): + gv = {} + for k, v in _declarations(): + if k not in _bypassed: + name, func = _init_api_function(k, v) + assert name not in gv + gv[name] = func + + # rewrite _v2 names + for k in list(gv.keys()): + if k.endswith('_v2'): + stripped = k[:-3] + assert stripped + '_v2' == k + gv[stripped] = gv[k] + + base = _cuSparse + return type('cuSparse', (base,), gv) + + +cuSparse = _init_cuSparse() diff --git a/pyculib/sparse/decls.py b/pyculib/sparse/decls.py new file mode 100644 index 0000000..2e6cfab --- /dev/null +++ b/pyculib/sparse/decls.py @@ -0,0 +1,547 @@ +# This file was auto-generated + +cusparseCreate = ('cusparseStatus_t', (('handle', 'cusparseHandle_t*'),)) + + +cusparseDestroy = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'),)) + + +cusparseGetVersion = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('version', 'int*'),)) + + +cusparseSetStream = 
('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('streamId', 'cudaStream_t'),)) + + +cusparseGetPointerMode = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('mode', 'cusparsePointerMode_t*'),)) + + +cusparseSetPointerMode = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('mode', 'cusparsePointerMode_t'),)) + + +cusparseCreateMatDescr = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t*'),)) + + +cusparseDestroyMatDescr = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatType = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('type', 'cusparseMatrixType_t'),)) + + +cusparseGetMatType = ('cusparseMatrixType_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatFillMode = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('fillMode', 'cusparseFillMode_t'),)) + + +cusparseGetMatFillMode = ('cusparseFillMode_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatDiagType = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('diagType', 'cusparseDiagType_t'),)) + + +cusparseGetMatDiagType = ('cusparseDiagType_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatIndexBase = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('base', 'cusparseIndexBase_t'),)) + + +cusparseGetMatIndexBase = ('cusparseIndexBase_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseCreateSolveAnalysisInfo = ('cusparseStatus_t', (('info', 'cusparseSolveAnalysisInfo_t*'),)) + + +cusparseDestroySolveAnalysisInfo = ('cusparseStatus_t', (('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseGetLevelInfo = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('nlevels', 'int*'), ('levelPtr', 'int**'), ('levelInd', 'int**'),)) + + +cusparseCreateHybMat = ('cusparseStatus_t', (('hybA', 'cusparseHybMat_t*'),)) + + +cusparseDestroyHybMat = ('cusparseStatus_t', (('hybA', 'cusparseHybMat_t'),)) + + +cusparseSaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('resultDevHostPtr', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('resultDevHostPtr', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('resultDevHostPtr', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 
'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('resultDevHostPtr', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCdotci = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('resultDevHostPtr', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZdotci = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('resultDevHostPtr', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSroti_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('c', 'float*'), ('s', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDroti_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('c', 'double*'), ('s', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseScsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 
'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseShybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDhybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseChybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZhybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseSbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + 
+cusparseZbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseSbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseScsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 
'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('y', 'float*'),)) + + +cusparseDcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('y', 'double*'),)) + + +cusparseCcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseShybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDhybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseChybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZhybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseShybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'float*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('y', 'float*'),)) + + +cusparseChybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'cuComplex*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseDhybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'double*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('y', 'double*'),)) + + +cusparseZhybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'cuDoubleComplex*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseScsrmm_v2 = 
('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'float*'), ('ldb', 'int'), ('beta', 'float*'), ('C', 'float*'), ('ldc', 'int'),)) + + +cusparseDcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'double*'), ('ldb', 'int'), ('beta', 'double*'), ('C', 'double*'), ('ldc', 'int'),)) + + +cusparseCcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuComplex*'), ('ldb', 'int'), ('beta', 'cuComplex*'), ('C', 'cuComplex*'), ('ldc', 'int'),)) + + +cusparseZcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'), ('beta', 'cuDoubleComplex*'), ('C', 'cuDoubleComplex*'), ('ldc', 'int'),)) + + +cusparseScsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'float*'), ('ldb', 'int'), ('beta', 'float*'), ('C', 'float*'), ('ldc', 'int'),)) + + +cusparseDcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'double*'), ('ldb', 'int'), ('beta', 'double*'), ('C', 'double*'), ('ldc', 'int'),)) + + +cusparseCcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuComplex*'), ('ldb', 'int'), ('beta', 'cuComplex*'), ('C', 'cuComplex*'), ('ldc', 'int'),)) + + +cusparseZcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'), ('beta', 'cuDoubleComplex*'), ('C', 'cuDoubleComplex*'), ('ldc', 'int'),)) + + +cusparseScsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), 
('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('ldx', 'int'), ('y', 'float*'), ('ldy', 'int'),)) + + +cusparseDcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('ldx', 'int'), ('y', 'double*'), ('ldy', 'int'),)) + + +cusparseCcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('ldx', 'int'), ('y', 'cuComplex*'), ('ldy', 'int'),)) + + +cusparseZcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('ldx', 'int'), ('y', 'cuDoubleComplex*'), ('ldy', 'int'),)) + + +cusparseScsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), 
('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseSgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('B', 'float*'), ('ldb', 'int'),)) + + +cusparseDgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('B', 'double*'), ('ldb', 'int'),)) + + +cusparseCgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuComplex*'), ('d', 'cuComplex*'), ('du', 'cuComplex*'), ('B', 'cuComplex*'), ('ldb', 'int'),)) + + +cusparseZgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'),)) + + +cusparseSgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('B', 'float*'), ('ldb', 'int'),)) + + +cusparseDgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('B', 'double*'), ('ldb', 'int'),)) + + +cusparseCgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuComplex*'), ('d', 'cuComplex*'), ('du', 'cuComplex*'), ('B', 'cuComplex*'), ('ldb', 'int'),)) + + +cusparseZgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'),)) + + +cusparseSgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('x', 'float*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseDgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('x', 'double*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseCgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'cuComplex*'), ('d', 
'cuComplex*'), ('du', 'cuComplex*'), ('x', 'cuComplex*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseZgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('x', 'cuDoubleComplex*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseXcsrgemmNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'float*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'double*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuDoubleComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseXcsrgeamNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), 
('n', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'float*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'float*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'double*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'double*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'cuComplex*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'cuDoubleComplex*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuDoubleComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseSnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseDnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseCnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseZnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseSdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseDdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'double*'), ('csrRowPtrA', 
'int*'), ('csrColIndA', 'int*'),)) + + +cusparseCdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseZdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseScsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'double*'), ('lda', 'int'),)) + + +cusparseCcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseSdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseDdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseCdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseZdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseScsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDcsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'double*'), ('lda', 'int'),)) + + +cusparseCcsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZcsc2dense = ('cusparseStatus_t', (('handle', 
'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseXcoo2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('cooRowInd', 'int*'), ('nnz', 'int'), ('m', 'int'), ('csrRowPtr', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseXcsr2coo = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('csrRowPtr', 'int*'), ('nnz', 'int'), ('m', 'int'), ('cooRowInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseScsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'float*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'float*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'double*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'double*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'cuComplex*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'cuComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'cuDoubleComplex*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'cuDoubleComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDhyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'double*'), ('lda', 'int'),)) + + 
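Each entry in this module is a prototype tuple of the form `(return_type, ((arg_name, arg_type), ...))`, one per cuSPARSE entry point. The sketch below shows how such a tuple can be rendered into a C declaration string; the `render_prototype` helper is hypothetical and only illustrates the tuple layout — it is not part of pyculib's binding layer.

```python
def render_prototype(name, proto):
    """Render a (return_type, args) prototype tuple into a C declaration."""
    restype, args = proto
    arglist = ', '.join('{} {}'.format(typ, argname) for argname, typ in args)
    return '{} {}({});'.format(restype, name, arglist)

# Using one of the entries defined above:
cusparseXcoo2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'),
                                         ('cooRowInd', 'int*'), ('nnz', 'int'),
                                         ('m', 'int'), ('csrRowPtr', 'int*'),
                                         ('idxBase', 'cusparseIndexBase_t'),))
print(render_prototype('cusparseXcoo2csr', cusparseXcoo2csr))
# -> cusparseStatus_t cusparseXcoo2csr(cusparseHandle_t handle, int* cooRowInd,
#    int nnz, int m, int* csrRowPtr, cusparseIndexBase_t idxBase);
```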
+cusparseChyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZhyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseScsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseDhyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseChyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseZhyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseScsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 
'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'float*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseDhyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'double*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseChyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'cuComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseZhyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'cuDoubleComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseXcsr2bsrNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'float*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseDcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'double*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseCcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'cuComplex*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseZcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'cuDoubleComplex*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseSbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 
'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + diff --git a/pyculib/tests/__init__.py b/pyculib/tests/__init__.py new file mode 100644 index 0000000..c9d3670 --- /dev/null +++ b/pyculib/tests/__init__.py @@ -0,0 +1,12 @@ +from os.path import dirname, join +from . import (test_blas_low_level, test_blas, test_fft, test_rand, + test_sorting, test_sparse) + +test_cases = ( + test_blas_low_level.test_cases + + test_blas.test_cases + + test_fft.test_cases + + test_rand.test_cases + + test_sorting.test_cases + + test_sparse.test_cases +) diff --git a/pyculib/tests/base.py b/pyculib/tests/base.py new file mode 100644 index 0000000..73e0d52 --- /dev/null +++ b/pyculib/tests/base.py @@ -0,0 +1,22 @@ +from unittest import TestCase, skipIf +from numba import cuda +from numba.cuda.cudadrv.error import CudaSupportError + + +def skip_cuda_tests(): + + try: + if cuda.is_available(): + gpus = cuda.list_devices() + if gpus and gpus[0].compute_capability >= (2, 0): + return False + else: + return True + return True + except CudaSupportError: + return True + + +@skipIf(skip_cuda_tests(), "CUDA not supported on this platform.") +class CUDATestCase(TestCase): + pass diff --git a/pyculib/tests/blas.py b/pyculib/tests/blas.py new file mode 100644 index 0000000..5face7a --- /dev/null +++ b/pyculib/tests/blas.py @@ -0,0 +1,395 @@ +from __future__ import print_function, absolute_import, division + +import unittest +import numpy as np +import scipy.linalg +from pyculib import warnings, config +from numba.testing.ddt import ddt, unpack, data +import time + +def create_array(dtype, shape, slices=None, empty=False): + """Create a test array of the given dtype and shape. + if slices is given, the returned array aliases a bigger parent array + using the specified start and step values. (The stop member is expected to + be appropriate to yield the given length.)""" + + from numpy.random import normal, seed + seed(1234) + + def total_size(s): + # this function doesn't support slices whose members are 'None' + return s.start + (s.stop - s.start)*np.abs(s.step) + + if not slices: + a = np.empty(dtype=dtype, shape=shape) + else: + if type(shape) is not tuple: # 1D + pshape = total_size(slices) + else: + pshape = tuple([total_size(s) for s in slices]) + parent = np.empty(dtype=dtype, shape=pshape) + a = parent[slices] + + if not empty: + mult = np.array(1, dtype=dtype) + a[:] = normal(0.,1.,shape).astype(dtype) * mult + return a + + +class TestBLAS: + """Create test cases by deriving from this (as well as unittest.TestCase. 
+ In the derived class, set the 'blas' attribute to the BLAS module that is + to be tested.""" + + blas = None + + def _test_dot(self, dtype, shape, slice, rtol=1e-07): + + x = create_array(dtype, shape, slice) + y = create_array(dtype, shape, slice) + res = self.blas.dot(x, y) + ref = np.dot(x, y) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_gemv(self, dtype, op, shape, slices, rtol=1e-07): + + sp_gemv = scipy.linalg.get_blas_funcs('gemv', dtype=dtype) + # f2py convention... + sp_trans = {'N':0, 'T':1, 'C':2} + + alpha = 2. + A = create_array(dtype, shape, slices, empty=True) + A[:] = np.arange(np.prod(shape), dtype=dtype).reshape(shape) + x = np.arange(op == 'N' and shape[1] or shape[0], dtype=dtype) + beta = 2. + y = np.arange(op == 'N' and shape[0] or shape[1], dtype=dtype) + res = self.blas.gemv(op, alpha, A, x, beta, y) + y = np.arange(op == 'N' and shape[0] or shape[1], dtype=dtype) + ref = sp_gemv(alpha, A, x, beta, y, trans=sp_trans[op]) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_axpy(self, dtype, size, slice, rtol=1e-07): + + sp_axpy = scipy.linalg.get_blas_funcs('axpy', dtype=dtype) + alpha = 2. + x = create_array(dtype, size, slice) + y = create_array(dtype, size, slice) + yr = np.copy(y) + res = self.blas.axpy(alpha, x, y) + ref = sp_axpy(x, yr, size, alpha) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_gemm(self, dtype, opa, opb, ashape, aslices, bshape, bslices, rtol=1e-07): + + M = opa == 'N' and ashape[0] or ashape[1] + N = opb == 'N' and bshape[1] or bshape[0] + cshape = (M, N) + sp_gemm = scipy.linalg.get_blas_funcs('gemm', dtype=dtype) + # f2py convention... + sp_trans = {'N':0, 'T':1, 'C':2} + + alpha = 2. + A = create_array(dtype, ashape, aslices, empty=True) + A[:] = np.arange(np.prod(ashape), dtype=dtype).reshape(ashape) + B = create_array(dtype, bshape, bslices, empty=True) + B[:] = np.arange(np.prod(bshape), dtype=dtype).reshape(bshape) + beta = 5. + C = create_array(dtype, cshape, empty=True) + C[:] = np.arange(np.prod(cshape), dtype=dtype).reshape(cshape) + res = self.blas.gemm(opa, opb, alpha, A, B, beta, C) + # C may have been overwritten in the previous operation. + C = np.arange(np.prod(cshape), dtype=dtype).reshape(cshape) + ref = sp_gemm(alpha, A, B, beta, C, trans_a=sp_trans[opa], trans_b=sp_trans[opb]) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def test_dot_invalid(self): + + x = np.arange(1024, dtype=np.float32) + y = np.arange(1024, dtype=np.float32) + # First make sure the original works... + res = self.blas.dot(x, y) + ref = np.dot(x, y) + np.testing.assert_allclose(res, ref, rtol=1e6) + # then check for various types of invalid input + with self.assertRaises(TypeError): # invalid type + self.blas.dot(np.arange(10), np.arange(10)) + with self.assertRaises(ValueError): + self.blas.dot(x, y.reshape(64, 16)) # invalid dim + with self.assertRaises(ValueError): + self.blas.dot(x, y[4:]) # invalid size + + def test_axpy_invalid(self): + + # First make sure the original works... + sp_axpy = scipy.linalg.get_blas_funcs('axpy', dtype=np.float32) + alpha = np.float32(2.) 
+ x = np.arange(64, dtype=np.float32) + y = np.arange(64, dtype=np.float32) + res = self.blas.axpy(alpha, x, y) + y = np.arange(64, dtype=np.float32) + ref = sp_axpy(x, y, 64, alpha) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + with self.assertRaises(TypeError): # invalid type + self.blas.axpy(7, np.arange(64), y) + with self.assertRaises(TypeError): # invalid type + self.blas.axpy(7, x, np.arange(64)) + with self.assertRaises(ValueError): + self.blas.axpy([1], x, y) # invalid scalar + with self.assertRaises(ValueError): + self.blas.axpy(alpha, x, y.reshape(8, 8)) # invalid dim + with self.assertRaises(ValueError): + self.blas.axpy(alpha, x, y[4:]) # invalid size + + def test_gemv_invalid(self): + + # First make sure the original works... + sp_gemv = scipy.linalg.get_blas_funcs('gemv', dtype=np.float32) + alpha = 2. + A = np.arange(64, dtype=np.float32).reshape(8,8) + x = np.arange(8, dtype=np.float32) + beta = 2. + y = np.arange(8, dtype=np.float32) + res = self.blas.gemv('N', alpha, A, x, beta, y) + y = np.arange(8, dtype=np.float32) + ref = sp_gemv(alpha, A, x, beta, y) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + i8x8 = np.arange(64).reshape(8,8) + i8 = np.arange(8) + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, i8x8, x, beta, y) # invalid type + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, A, i8, beta, y) # invalid type + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, A, x, beta, i8) # invalid type + with self.assertRaises(ValueError): + self.blas.gemv('X', alpha, A, x, beta, y) # invalid op + with self.assertRaises(ValueError): + self.blas.gemv( 'N', [1], A, x, beta, y) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A[0], x) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x.reshape(2, 4), beta, y) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x, beta, y.reshape(2, 4)) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A.reshape(64), x, beta, y) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A[1:,:], x, beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x[1:], beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x, beta, y[1:]) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('T', alpha, A, x[1:], beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('T', alpha, A, x, beta, y[1:]) # invalid size + + + def test_gemm_invalid(self): + + # First make sure the original works... + sp_gemm = scipy.linalg.get_blas_funcs('gemm', dtype=np.float32) + + alpha = 2. + A = np.arange(64, dtype=np.float32).reshape(8, 8) + B = np.arange(64, dtype=np.float32).reshape(8, 8) + beta = 5. + C = np.arange(64, dtype=np.float32).reshape(8, 8) + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + # C may have been overwritten in the previous operation. 
+ C = np.arange(64, dtype=np.float32).reshape(8, 8) + ref = sp_gemm(alpha, A, B, beta, C) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + i8x8 = np.arange(64).reshape(8,8) + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, i8x8, B, beta, C) # invalid type + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, A, i8x8, beta, C) # invalid type + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, A, B, beta, i8x8) # invalid type + with self.assertRaises(ValueError): + self.blas.gemm('X', 'N', alpha, A, B, beta, C) # invalid op + with self.assertRaises(ValueError): + self.blas.gemm('N', 'X', alpha, A, B, beta, C) # invalid op + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', [1], A, B, beta, C) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, [1], C) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemm('C', 'N', alpha, A[0], B) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B[0]) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A.reshape(64), B, beta, C) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B.reshape(64), beta, C) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C.reshape(64)) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A[1:,:], B, beta, C) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B[1:,:], beta, C) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C[1:,:]) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C[:,1:]) # invalid size + + def test_gemv_default(self): + + # Check that default argument and aliasing rules work as expected + alpha = 2. + A = np.arange(64, dtype=np.float64).reshape(8,8) + x = np.arange(8, dtype=np.float64) + beta = 0. + yres = self.blas.gemv('N', alpha, A, x) + y = np.asfortranarray(np.arange(8, dtype=np.float64)) + res = self.blas.gemv('N', alpha, A, x, beta, y) + # Make sure the result is the same even with no default y... + np.testing.assert_allclose(yres, res) + # ...and res indeed aliases y + self.assertIs(res, y) + # Make sure this also works for non-contiguous y + p = np.arange(16, dtype=np.float64) / 2 + y = p[::2] + res = self.blas.gemv('N', alpha, A, x, beta, y) + np.testing.assert_allclose(yres, y) + np.testing.assert_allclose(res, y) + + def test_gemm_default(self): + + # Check that default argument and aliasing rules work as expected + alpha = 2. + A = np.arange(64, dtype=np.float64).reshape(8,8) + B = np.arange(64, dtype=np.float64).reshape(8,8) + beta = 0. + Cres = self.blas.gemm('N', 'N', alpha, A, B) + C = np.arange(64, dtype=np.float64).reshape(8, 8, order='F') + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + # Make sure the result is the same even with no default C... 
+ np.testing.assert_allclose(Cres, res) + # ...and res indeed aliases C + self.assertIs(res, C) + # Make sure this also works for non-contiguous C + p = np.arange(256, dtype=np.float64).reshape(16, 16) / 2 + C = p[::2,::2] + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + np.testing.assert_allclose(Cres, C) + np.testing.assert_allclose(res, C) + + def test_dot_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(type(self.blas.dot(x, y)), np.float64) + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.complex64) + self.assertIs(type(self.blas.dot(x, y)), np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_dot_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.dot(x, y) + + def test_axpy_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.float64) + x = x.astype(np.complex64) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.complex128) + alpha = 2.+1j + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_axpy_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.axpy(alpha, x, y) # type promotion + y = y.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.axpy(alpha, x[::2], y[::2]) # non-unit-stride + + def test_gemv_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. + A = np.arange(16, dtype=np.float64).reshape(4,4) + x = np.arange(4, dtype=np.float64) + beta = 0. + self.assertIs(self.blas.gemv('N', alpha, A, x).dtype.type, np.float64) + x = x.astype(np.complex64) + self.assertIs(self.blas.gemv('N', alpha, A, x).dtype.type, np.complex128) + y = np.asfortranarray(np.arange(4, dtype=np.float64)) + self.assertIs(self.blas.gemv('N', alpha, A, x, beta, y).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_gemv_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + A = np.arange(16, dtype=np.float64).reshape(4,4) + x = np.arange(4, dtype=np.float32) + beta = 0. + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A, x) # type promotion + x = x.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A, x, beta, y) # type promotion + y = y.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A[::2,::2], x[::2], beta, y[::2]) # non-unit-stride + + def test_gemm_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. 
+ A = np.arange(16, dtype=np.float64).reshape(4,4) + B = np.arange(16, dtype=np.float64).reshape(4,4) + beta = 0. + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B).dtype.type, np.float64) + A = A.astype(np.complex64) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B).dtype.type, np.complex128) + C = np.asfortranarray(np.arange(16, dtype=np.float64).reshape(4,4)) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B, beta, C).dtype.type, np.complex128) + C = C.astype(np.complex128) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B, beta, C).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_gemm_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + A = np.arange(16, dtype=np.float32).reshape(4,4) + B = np.arange(16, dtype=np.float64).reshape(4,4) + beta = 0. + C = np.arange(16, dtype=np.complex64).reshape(4,4) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A, B) # type promotion + A = A.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A, B, beta, C) # type promotion + C = np.arange(16, dtype=np.float64).reshape(4,4) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A[::2,::2], B[::2,::2], beta, C[::2,::2]) # non-unit-stride + + + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_blas.py b/pyculib/tests/test_blas.py new file mode 100644 index 0000000..b54c67b --- /dev/null +++ b/pyculib/tests/test_blas.py @@ -0,0 +1,126 @@ +from __future__ import print_function, absolute_import, division + +import unittest +import numpy as np +from numba.testing.ddt import ddt, unpack, data +from pyculib.tests.base import CUDATestCase +from pyculib import blas as cublas +from pyculib.tests import blas + +@ddt +class TestCUDABLAS(blas.TestBLAS, CUDATestCase): + + blas = cublas + + @data((np.float32, 1024, slice(0, 1024, 1)), + (np.float64, 1024, slice(0, 1024, 1)), + (np.complex64, 1024, slice(0, 1024, 1)), + (np.complex128, 1024, slice(0, 1024, 1)), + (np.complex128, 1024, slice(1, 1025, 1)), + (np.float32, 1024, slice(0, 2048, 2)), + (np.float64, 1024, slice(0, 2048, 2)), + (np.complex64, 1024, slice(0, 2048, 2)), + (np.complex128, 1024, slice(0, 2048, 2))) + @unpack + def test_dot(self, dtype, shape, slice): + + eps = np.finfo(dtype).eps + self._test_dot(dtype, shape, slice, rtol=eps*10) + + @data((np.float32, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'T', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', (64, 67), (slice(1, 65, 1), slice(2, 69, 1))), + (np.complex128, 'T', (67, 64), (slice(2, 69, 1), slice(1, 65, 1))), + (np.complex128, 'C', (67, 64), (slice(2, 69, 1), slice(1, 65, 1))), + (np.float32, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.float64, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.complex64, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.complex128, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))) + ) + @unpack + def test_gemv(self, dtype, op, shape, slices): + + self._test_gemv(dtype, op, shape, slices) + + @data((np.float32, 64, slice(0, 64, 1)), + (np.float64, 64, slice(0, 64, 1)), + (np.complex64, 64, 
slice(0, 64, 1)), + (np.complex128, 64, slice(0, 64, 1)), + (np.complex128, 64, slice(1, 65, 1)), + (np.float32, 64, slice(0, 128, 2)), + (np.float64, 64, slice(0, 128, 2)), + (np.complex64, 64, slice(0, 128, 2)), + (np.complex128, 64, slice(0, 128, 2))) + @unpack + def test_axpy(self, dtype, size, slice): + + self._test_axpy(dtype, size, slice) + + @data((np.float32, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 63), (slice(0, 67, 1), slice(0, 63, 1))), + (np.float64, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.float64, 'T', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.float64, 'N', 'T', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'T', 'T', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'T', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'N', 'T', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'T', 'T', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'C', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'N', 'C', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'C', 'C', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex128, 'N', 'N', + (64, 67), (slice(1, 65, 1), slice(2, 69, 1)), + (67, 64), (slice(1, 68, 1), slice(2, 66, 1))), + (np.float32, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.float64, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.complex64, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.complex128, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2)))) + @unpack + def test_gemm(self, dtype, opa, opb, ashape, aslices, bshape, bslices): + + eps = np.finfo(dtype).eps + self._test_gemm(dtype, opa, opb, ashape, aslices, bshape, bslices, rtol=eps*10) + + +test_cases = (TestCUDABLAS,) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_blas_low_level.py b/pyculib/tests/test_blas_low_level.py new file mode 100644 index 0000000..fb07ac2 --- /dev/null +++ b/pyculib/tests/test_blas_low_level.py @@ -0,0 +1,2003 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCuBlasBinding(CUDATestCase): + def test_lib(self): + from pyculib.blas.binding import (cuBlas, + CUBLAS_POINTER_MODE_HOST, + CUBLAS_ATOMICS_NOT_ALLOWED) + + stream = cuda.stream() + blas = cuBlas() + blas.stream = stream + self.assertTrue(blas.stream is stream) + blas.pointer_mode = 
CUBLAS_POINTER_MODE_HOST + self.assertTrue(blas.pointer_mode == CUBLAS_POINTER_MODE_HOST) + blas.atomics_mode = CUBLAS_ATOMICS_NOT_ALLOWED + self.assertTrue(blas.atomics_mode == CUBLAS_ATOMICS_NOT_ALLOWED) + + def Tnrm2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) + exp = np.linalg.norm(x) + self.assertTrue(np.allclose(got, exp)) + + def test_Snrm2(self): + self.Tnrm2('Snrm2', np.float32) + + def test_Dnrm2(self): + self.Tnrm2('Dnrm2', np.float64) + + def test_Scnrm2(self): + self.Tnrm2('Scnrm2', np.complex64) + + def test_Dznrm2(self): + self.Tnrm2('Dznrm2', np.complex128) + + def Tdot(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1, d_y, 1) + if fn.endswith('c'): + exp = np.vdot(x, y) + else: + exp = np.dot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_Sdot(self): + self.Tdot('Sdot', np.float32) + + def test_Ddot(self): + self.Tdot('Ddot', np.float64) + + def test_Cdotu(self): + self.Tdot('Cdotu', np.complex64) + + def test_Zdotu(self): + self.Tdot('Zdotu', np.complex128) + + def test_Cdotc(self): + self.Tdot('Cdotc', np.complex64) + + def test_Zdotc(self): + self.Tdot('Zdotc', np.complex128) + + def Tscal(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + alpha = 1.234 + x = np.random.random(10).astype(dtype) + x0 = x.copy() + d_x = cuda.to_device(x) + + blas = cuBlas() + getattr(blas, fn)(x.size, alpha, d_x, 1) + + d_x.copy_to_host(x) + + self.assertTrue(np.allclose(x0 * alpha, x)) + + def test_Sscal(self): + self.Tscal('Sscal', np.float32) + + def test_Dscal(self): + self.Tscal('Dscal', np.float64) + + def test_Cscal(self): + self.Tscal('Cscal', np.complex64) + + def test_Zscal(self): + self.Tscal('Zscal', np.complex128) + + def test_Csscal(self): + self.Tscal('Csscal', np.complex64) + + def test_Zdscal(self): + self.Tscal('Zdscal', np.complex128) + + def Taxpy(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + alpha = 1.234 + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + y0 = y.copy() + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, alpha, d_x, 1, d_y, 1) + + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(alpha * x + y0, y)) + + def test_Saxpy(self): + self.Taxpy('Saxpy', np.float32) + + def test_Daxpy(self): + self.Taxpy('Daxpy', np.float64) + + def test_Caxpy(self): + self.Taxpy('Caxpy', np.complex64) + + def test_Zaxpy(self): + self.Taxpy('Zaxpy', np.complex128) + + def Itamax(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) - 1 + self.assertTrue(np.allclose(np.argmax(x), got)) + + def test_Isamax(self): + self.Itamax('Isamax', np.float32) + + def test_Idamax(self): + self.Itamax('Idamax', np.float64) + + def test_Icamax(self): + self.Itamax('Icamax', np.complex64) + + def test_Izamax(self): + self.Itamax('Izamax', np.complex128) + + def Itamin(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) - 1 + 
self.assertTrue(np.allclose(np.argmin(x), got)) + + def test_Isamin(self): + self.Itamin('Isamin', np.float32) + + def test_Idamin(self): + self.Itamin('Idamin', np.float64) + + def test_Icamin(self): + self.Itamin('Icamin', np.complex64) + + def test_Izamin(self): + self.Itamin('Izamin', np.complex128) + + def Tasum(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) + self.assertTrue(np.allclose(np.sum(x), got)) + + def test_Sasum(self): + self.Tasum('Sasum', np.float32) + + def test_Dasum(self): + self.Tasum('Dasum', np.float64) + + def test_Scasum(self): + self.Tasum('Scasum', np.complex64) + + def test_Dzasum(self): + self.Tasum('Dzasum', np.complex128) + + def Trot(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + angle = 1.342 + c = np.cos(angle) + s = np.sin(angle) + + x0, y0 = c * x + s * y, -s * x + c * y + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, d_x, 1, d_y, 1, c, s) + + d_x.copy_to_host(x) + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_Srot(self): + self.Trot('Srot', np.float32) + + def test_Drot(self): + self.Trot('Drot', np.float64) + + def test_Crot(self): + self.Trot('Crot', np.complex64) + + def test_Zrot(self): + self.Trot('Zrot', np.complex128) + + def test_Csrot(self): + self.Trot('Csrot', np.complex64) + + def test_Zdrot(self): + self.Trot('Zdrot', np.complex128) + + def Trotg(self, fn): + from pyculib.blas.binding import cuBlas + + a, b = np.random.random(), np.random.random() + blas = cuBlas() + r, z, c, s = getattr(blas, fn)(a, b) + + rot = np.array([[c, s], + [-np.conj(s), c]]) + vec = np.array([[a], + [b]]) + exp = np.dot(rot, vec) + got = np.array([[r], + [0.0]]) + self.assertTrue(np.allclose(exp, got, atol=1e-6)) + + def test_Srotg(self): + self.Trotg('Srotg') + + def test_Drotg(self): + self.Trotg('Drotg') + + def test_Crotg(self): + self.Trotg('Crotg') + + def test_Zrotg(self): + self.Trotg('Zrotg') + + def Trotm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + + param = np.random.random(5).astype(dtype) + param[0] = -1.0 + h11, h21, h12, h22 = param[1:].tolist() + + x0, y0 = h11 * x + h12 * y, h21 * x + h22 * y + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, d_x, 1, d_y, 1, param) + + d_x.copy_to_host(x) + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_Srotm(self): + self.Trotm('Srotm', np.float32) + + def test_Drotm(self): + self.Trotm('Drotm', np.float64) + + def Trotmg(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + d1, d2, x1, y1 = np.random.random(4).tolist() + + blas = cuBlas() + param = getattr(blas, fn)(d1, d2, x1, y1) + + flag, h11, h21, h12, h22 = param.tolist() + + if flag == -1.0: + pass # don't know how to check + elif flag == 0.0: + self.assertEqual(h11, 0) + self.assertEqual(h22, 0) + elif flag == 1.0: + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + else: + self.assertEqual(flag, -2.0) + self.assertEqual(h11, 0) + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + self.assertEqual(h22, 0) + + def test_Srotmg(self): + self.Trotmg('Srotmg', np.float32) 
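The low-level binding tests above all follow the same round-trip pattern: stage NumPy operands on the device with `numba.cuda.to_device`, call the corresponding `cuBlas` method, copy the result back, and compare against a NumPy reference. A minimal standalone sketch of that pattern, reusing the `Dnrm2` binding exercised above (it assumes a CUDA-capable device and a working pyculib installation):

```python
import numpy as np
from numba import cuda
from pyculib.blas.binding import cuBlas

x = np.random.random(10)               # float64 host data
d_x = cuda.to_device(x)                 # stage the operand on the GPU
got = cuBlas().Dnrm2(x.size, d_x, 1)    # low-level 2-norm binding
np.testing.assert_allclose(got, np.linalg.norm(x))
```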
+ + def test_Drotmg(self): + self.Trotmg('Drotmg', np.float64) + + # + # Level 2 tests + # They just simply test to see if the binding works; doesn't check for + # correct result. + # + + def Tgbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + kl = 0 + ku = 0 + alpha = 1. + beta = 0. + A = np.array([[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + lda, n = A.shape + m = lda + y0 = y.copy() + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + getattr(blas, fn)('N', m, n, kl, ku, alpha, dA, lda, dx, 1, beta, dy, 1) + dy.copy_to_host(y) + self.assertFalse(all(y0 == y)) + + def test_Sgbmv(self): + self.Tgbmv('Sgbmv', np.float32) + + def test_Dgbmv(self): + self.Tgbmv('Dgbmv', np.float64) + + def test_Cgbmv(self): + self.Tgbmv('Cgbmv', np.complex64) + + def test_Zgbmv(self): + self.Tgbmv('Zgbmv', np.complex128) + + def Tgemv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + alpha = 1. + beta = 0. + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + m, n = A.shape + lda = m + y0 = y.copy() + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + getattr(blas, fn)('N', m, n, alpha, dA, lda, dx, 1, beta, dy, 1) + dy.copy_to_host(y) + self.assertFalse(all(y0 == y)) + + def test_Sgemv(self): + self.Tgemv('Sgemv', np.float32) + + def test_Dgemv(self): + self.Tgemv('Dgemv', np.float64) + + def test_Cgemv(self): + self.Tgemv('Cgemv', np.complex64) + + def test_Zgemv(self): + self.Tgemv('Zgemv', np.complex128) + + def Ttrmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = True + n = 3 + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + lda = n + x0 = x.copy() + inc = 1 + dA = cuda.to_device(A) + dx = cuda.to_device(x) + getattr(blas, fn)(uplo, trans, diag, n, dA, lda, dx, inc) + dx.copy_to_host(x) + self.assertFalse(all(x == x0)) + + def test_Strmv(self): + self.Ttrmv('Strmv', np.float32) + + def test_Dtrmv(self): + self.Ttrmv('Dtrmv', np.float64) + + def test_Ctrmv(self): + self.Ttrmv('Ctrmv', np.complex64) + + def test_Ztrmv(self): + self.Ttrmv('Ztrmv', np.complex128) + + def Ttbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + lda = n + x0 = x.copy() + inc = 1 + k = 0 + getattr(blas, fn)(uplo, trans, diag, n, k, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stbmv(self): + self.Ttbmv('Stbmv', np.float32) + + def test_Dtbmv(self): + self.Ttbmv('Dtbmv', np.float64) + + def test_Ctbmv(self): + self.Ttbmv('Ctbmv', np.complex64) + + def test_Ztbmv(self): + self.Ttbmv('Ztbmv', np.complex128) + + def Ttpmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dAP, dx, inc) 
+ dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stpmv(self): + self.Ttpmv('Stpmv', np.float32) + + def test_Dtpmv(self): + self.Ttpmv('Dtpmv', np.float64) + + def test_Ctpmv(self): + self.Ttpmv('Ctpmv', np.complex64) + + def test_Ztpmv(self): + self.Ttpmv('Ztpmv', np.complex128) + + def Ttrsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + lda = n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Strsv(self): + self.Ttrsv('Strsv', np.float32) + + def test_Dtrsv(self): + self.Ttrsv('Dtrsv', np.float64) + + def test_Ctrsv(self): + self.Ttrsv('Ctrsv', np.complex64) + + def test_Ztrsv(self): + self.Ttrsv('Ztrsv', np.complex128) + + def _Ttpsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dA, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stpsv(self): + self._Ttpsv('Stpsv', np.float32) + + def test_Dtpsv(self): + self._Ttpsv('Dtpsv', np.float64) + + def test_Ctpsv(self): + self._Ttpsv('Ctpsv', np.complex64) + + def test_Ztpsv(self): + self._Ttpsv('Ztpsv', np.complex128) + + def _Ttbsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + lda = n = 3 + k = 0 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, k, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stbsv(self): + self._Ttbsv('Stbsv', np.float32) + + def test_Dtbsv(self): + self._Ttbsv('Dtbsv', np.float64) + + def test_Ctbsv(self): + self._Ttbsv('Ctbsv', np.complex64) + + def test_Ztbsv(self): + self._Ttbsv('Ztbsv', np.complex128) + + def _Tsymv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + lda = n = 3 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dA, lda, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Ssymv(self): + self._Tsymv('Ssymv', np.float32) + + def test_Dsymv(self): + self._Tsymv('Dsymv', np.float64) + + def test_Csymv(self): + self._Tsymv('Csymv', np.complex64) + + def test_Zsymv(self): + self._Tsymv('Zsymv', np.complex128) + + _Themv = _Tsymv + + def test_Chemv(self): + self._Themv('Chemv', np.complex64) + + def test_Zhemv(self): + self._Themv('Zhemv', np.complex128) + + def _Tsbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], 
dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + lda = n = 3 + k = 0 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, k, alpha, dA, lda, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Ssbmv(self): + self._Tsbmv('Ssbmv', np.float32) + + def test_Dsbmv(self): + self._Tsbmv('Dsbmv', np.float64) + + _Thbmv = _Tsbmv + + def test_Chbmv(self): + self._Thbmv('Chbmv', np.complex64) + + def test_Zhbmv(self): + self._Thbmv('Zhbmv', np.complex128) + + def _Tspmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + n = 3 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dAP, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Sspmv(self): + self._Tspmv('Sspmv', np.float32) + + def test_Dspmv(self): + self._Tspmv('Dspmv', np.float64) + + _Thpmv = _Tspmv + + def test_Chpmv(self): + self._Thpmv('Chpmv', np.complex64) + + def test_Zspmv(self): + self._Tspmv('Zhpmv', np.complex128) + + def _Tger(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + + blas = cuBlas() + + lda = m = n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(m, n, alpha, dx, incx, dy, incy, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Sger(self): + self._Tger('Sger', np.float32) + + def test_Dger(self): + self._Tger('Dger', np.float64) + + def test_Cgeru(self): + self._Tger('Cgeru', np.complex64) + + def test_Cgerc(self): + self._Tger('Cgerc', np.complex64) + + def test_Zgeru(self): + self._Tger('Zgeru', np.complex128) + + def test_Zgerc(self): + self._Tger('Zgerc', np.complex128) + + + def _Tsyr(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Ssyr(self): + self._Tsyr('Ssyr', np.float32) + + def test_Dsyr(self): + self._Tsyr('Dsyr', np.float64) + + def test_Csyr(self): + self._Tsyr('Csyr', np.complex64) + + def test_Zsyr(self): + self._Tsyr('Zsyr', np.complex128) + + def _Ther(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Cher(self): + self._Ther('Cher', np.complex64) + + def test_Zher(self): + 
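+        # her (the Hermitian rank-1 update) is only defined for complex dtypes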
self._Ther('Zher', np.complex128) + + def _Tspr(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + n = 3 + AP0 = AP.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dAP) + dAP.copy_to_host(AP) + + self.assertFalse(np.all(AP == AP0)) + + def test_Sspr(self): + self._Tspr('Sspr', np.float32) + + def test_Dspr(self): + self._Tspr('Dspr', np.float64) + + def test_Chpr(self): + self._Tspr('Chpr', np.complex64) + + def test_Zhpr(self): + self._Tspr('Zhpr', np.complex128) + + def _Tsyr2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dy, incy, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + _Ther2 = _Tsyr2 + + def test_Ssyr2(self): + self._Tsyr2('Ssyr2', np.float32) + + def test_Dsyr2(self): + self._Tsyr2('Dsyr2', np.float64) + + def test_Csyr2(self): + self._Tsyr2('Csyr2', np.complex64) + + def test_Zsyr2(self): + self._Tsyr2('Zsyr2', np.complex128) + + def test_Cher2(self): + self._Ther2('Cher2', np.complex64) + + def test_Zher2(self): + self._Ther2('Zher2', np.complex128) + + def _Tspr2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dy, incy, dA) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + _Thpr2 = _Tspr2 + + def test_Sspr2(self): + self._Tspr2('Sspr2', np.float32) + + def test_Dspr2(self): + self._Tspr2('Sspr2', np.float64) + + def test_Chpr2(self): + self._Thpr2('Chpr2', np.complex64) + + def test_Zhpr2(self): + self._Thpr2('Zhpr2', np.complex128) + + # Level 3 + + def _Tgemm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + transa = 'N' + transb = 'N' + blas = cuBlas() + + lda = ldb = ldc = m = n = k = 3 + C0 = C.copy() + getattr(blas, fn)(transa, transb, m, n, k, alpha, dA, lda, dB, ldb, + beta, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sgemm(self): + self._Tgemm('Sgemm', np.float32) + + def test_Dgemm(self): + self._Tgemm('Dgemm', np.float64) + + def test_Cgemm(self): + self._Tgemm('Cgemm', np.complex64) + + + def test_Zgemm(self): + self._Tgemm('Zgemm', np.complex128) + + + def _Tsyrk(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + 
[0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + uplo = 'U' + trans = 'N' + + blas = cuBlas() + + lda = ldc = n = k = 3 + C0 = C.copy() + getattr(blas, fn)(uplo, trans, n, k, alpha, dA, lda, beta, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Ssyrk(self): + self._Tsyrk('Ssyrk', np.float32) + + def test_Dsyrk(self): + self._Tsyrk('Dsyrk', np.float64) + + def test_Csyrk(self): + self._Tsyrk('Csyrk', np.complex64) + + def test_Zsyrk(self): + self._Tsyrk('Zsyrk', np.complex128) + + _Therk = _Tsyrk + + def test_Cherk(self): + self._Therk('Cherk', np.complex64) + + def test_Zherk(self): + self._Therk('Zherk', np.complex128) + + def _Tsymm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + side = 'L' + uplo = 'U' + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(side, uplo, m, n, alpha, dA, lda, dB, ldb, beta, dC, + ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Ssymm(self): + self._Tsymm('Ssymm', np.float32) + + def test_Dsymm(self): + self._Tsymm('Dsymm', np.float64) + + def test_Csymm(self): + self._Tsymm('Csymm', np.complex64) + + def test_Zsymm(self): + self._Tsymm('Zsymm', np.complex128) + + _Themm = _Tsymm + + def test_Chemm(self): + self._Themm('Chemm', np.complex64) + + def test_Zhemm(self): + self._Themm('Zhemm', np.complex128) + + def _Ttrsm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + dA = cuda.to_device(A) + dB = cuda.to_device(B) + + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + + blas = cuBlas() + + lda = ldb = m = n = 3 + B0 = B.copy() + getattr(blas, fn)(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, + ldb) + dB.copy_to_host(B) + + self.assertFalse(np.all(B == B0)) + + def test_Strsm(self): + self._Ttrsm('Strsm', np.float32) + + def test_Dtrsm(self): + self._Ttrsm('Dtrsm', np.float64) + + def test_Ctrsm(self): + self._Ttrsm('Ctrsm', np.complex64) + + def test_Ztrsm(self): + self._Ttrsm('Ztrsm', np.complex128) + + def _Ttrmm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, + ldb, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Strmm(self): + self._Ttrmm('Strmm', np.float32) + + def test_Dtrmm(self): + self._Ttrmm('Dtrmm', np.float64) + + def test_Ctrmm(self): + self._Ttrmm('Ctrmm', np.complex64) + + def test_Ztrmm(self): + self._Ttrmm('Ztrmm', np.complex128) + + + def _Tdgmm(self, fn, 
dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 2.4], dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dC = cuda.to_device(C) + + side = 'L' + + blas = cuBlas() + + lda = ldc = m = n = 3 + C0 = C.copy() + incx = 1 + getattr(blas, fn)(side, m, n, dA, lda, dx, incx, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sdgmm(self): + self._Tdgmm('Sdgmm', np.float32) + + def test_Ddgmm(self): + self._Tdgmm('Ddgmm', np.float64) + + def test_Cdgmm(self): + self._Tdgmm('Cdgmm', np.complex64) + + def test_Zdgmm(self): + self._Tdgmm('Zdgmm', np.complex128) + + + def _Tgeam(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + transa = 'N' + transb = 'N' + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(transa, transb, m, n, alpha, dA, lda, beta, dB, + ldb, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sgeam(self): + self._Tgeam('Sgeam', np.float32) + + def test_Dgeam(self): + self._Tgeam('Dgeam', np.float64) + + def test_Cgeam(self): + self._Tgeam('Cgeam', np.complex64) + + def test_Zgeam(self): + self._Tgeam('Zgeam', np.complex128) + + +class TestCuBlasAPI(CUDATestCase): + def setUp(self): + from pyculib.blas import Blas + + self.blas = Blas() + + + def Tnrm2(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + exp = np.linalg.norm(x) + self.assertTrue(np.allclose(got, exp)) + + def test_nrm2(self): + self.Tnrm2(self.blas.nrm2, np.float32) + self.Tnrm2(self.blas.nrm2, np.float64) + self.Tnrm2(self.blas.nrm2, np.complex64) + self.Tnrm2(self.blas.nrm2, np.complex128) + + def Tdot(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + got = self.blas.dot(x, y) + exp = np.dot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_dot(self): + self.Tdot(self.blas.dot, np.float32) + self.Tdot(self.blas.dot, np.float64) + self.Tdot(self.blas.dot, np.complex64) + self.Tdot(self.blas.dot, np.complex128) + + def Tdotc(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + got = self.blas.dotc(x, y) + exp = np.vdot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_dotc(self): + self.Tdot(self.blas.dotc, np.float32) + + def Tscal(self, fn, dtype, alpha): + x = np.random.random(10).astype(dtype) + x0 = x.copy() + fn(alpha, x) + self.assertTrue(np.allclose(x0 * alpha, x)) + + def test_scal(self): + self.Tscal(self.blas.scal, np.float32, 1.234) + self.Tscal(self.blas.scal, np.float64, 1.234) + self.Tscal(self.blas.scal, np.complex64, 1.234 + 5j) + self.Tscal(self.blas.scal, np.complex128, 1.234 + 5j) + self.Tscal(self.blas.scal, np.complex64, 1.234) + self.Tscal(self.blas.scal, np.complex128, 1.234) + + def Taxpy(self, fn, dtype, alpha): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + y0 = y.copy() + + fn(alpha, x, y) + + self.assertTrue(np.allclose(alpha * x + y0, y)) + + def test_axpy(self): + 
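+        # real alpha for the float dtypes, purely imaginary alpha for the complex dtypes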
self.Taxpy(self.blas.axpy, np.float32, 1.234) + self.Taxpy(self.blas.axpy, np.float64, 1.234) + self.Taxpy(self.blas.axpy, np.complex64, 1.234j) + self.Taxpy(self.blas.axpy, np.complex128, 1.234j) + + def Itamax(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.argmax(x), got)) + + def test_amax(self): + self.Itamax(self.blas.amax, np.float32) + self.Itamax(self.blas.amax, np.float64) + self.Itamax(self.blas.amax, np.complex64) + self.Itamax(self.blas.amax, np.complex128) + + def Itamin(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.argmin(x), got)) + + def test_amin(self): + self.Itamin(self.blas.amin, np.float32) + self.Itamin(self.blas.amin, np.float64) + self.Itamin(self.blas.amin, np.complex64) + self.Itamin(self.blas.amin, np.complex128) + + def Tasum(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.sum(x), got)) + + def test_asum(self): + self.Tasum(self.blas.asum, np.float32) + self.Tasum(self.blas.asum, np.float64) + self.Tasum(self.blas.asum, np.complex64) + self.Tasum(self.blas.asum, np.complex128) + + def Trot(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + angle = 1.342 + c = np.cos(angle) + s = np.sin(angle) + + x0, y0 = c * x + s * y, -s * x + c * y + + fn(x, y, c, s) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_rot(self): + self.Trot(self.blas.rot, np.float32) + self.Trot(self.blas.rot, np.float64) + self.Trot(self.blas.rot, np.complex64) + self.Trot(self.blas.rot, np.complex128) + + def Trotg(self, fn, dt1, dt2): + a, b = (np.array(np.random.random(), dtype=dt1), + np.array(np.random.random(), dtype=dt2)) + r, z, c, s = fn(a, b) + + rot = np.array([[c, s], + [-np.conj(s), c]]) + vec = np.array([[a], + [b]]) + exp = np.dot(rot, vec) + got = np.array([[r], + [0.0]]) + self.assertTrue(np.allclose(exp, got, atol=1e-6)) + + def test_rotg(self): + self.Trotg(self.blas.rotg, np.float32, np.float32) + self.Trotg(self.blas.rotg, np.float64, np.float64) + self.Trotg(self.blas.rotg, np.complex64, np.complex64) + self.Trotg(self.blas.rotg, np.complex128, np.complex128) + + def Trotm(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + + param = np.random.random(5).astype(dtype) + param[0] = -1.0 + h11, h21, h12, h22 = param[1:].tolist() + + x0, y0 = h11 * x + h12 * y, h21 * x + h22 * y + + fn(x, y, param) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_rotm(self): + self.Trotm(self.blas.rotm, np.float32) + self.Trotm(self.blas.rotm, np.float64) + + def Trotmg(self, fn, dtype): + d1, d2, x1, y1 = np.random.random(4).tolist() + + param = fn(d1, d2, x1, y1) + + flag, h11, h21, h12, h22 = param.tolist() + + if flag == -1.0: + pass # don't know how to check + elif flag == 0.0: + self.assertEqual(h11, 0) + self.assertEqual(h22, 0) + elif flag == 1.0: + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + else: + self.assertEqual(flag, -2.0) + self.assertEqual(h11, 0) + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + self.assertEqual(h22, 0) + + def test_rotmg(self): + self.Trotmg(self.blas.rotmg, np.float32) + self.Trotmg(self.blas.rotmg, np.float64) + + # Level 2 + + def _test_all(self, test, fn): + dtypes = np.float32, np.float64, np.complex64, np.complex128 + for dt in dtypes: + test(fn, dt) + + def _test_float(self, test, 
fn): + dtypes = np.float32, np.float64 + for dt in dtypes: + test(fn, dt) + + def _test_complex(self, test, fn): + dtypes = np.complex64, np.complex128 + for dt in dtypes: + test(fn, dt) + + def Tgbmv(self, fn, dtype): + kl = 0 + ku = 0 + alpha = 1. + beta = 0. + A = np.array([[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + lda, n = A.shape + m = lda + y0 = y.copy() + fn('N', m, n, kl, ku, alpha, A, x, beta, y) + self.assertFalse(all(y0 == y)) + + def test_gbmv(self): + self._test_all(self.Tgbmv, self.blas.gbmv) + + def Tgemv(self, fn, dtype): + alpha = 1. + beta = 0. + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + m, n = A.shape + y0 = y.copy() + + fn('N', m, n, alpha, A, x, beta, y) + self.assertFalse(all(y0 == y)) + + def test_gemv(self): + self._test_all(self.Tgemv, self.blas.gemv) + + def Ttrmv(self, fn, dtype): + uplo = 'U' + trans = 'N' + diag = True + n = 3 + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_trmv(self): + self._test_all(self.Ttrmv, self.blas.trmv) + + def Ttbmv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + k = 0 + fn(uplo, trans, diag, n, k, A, x) + self.assertFalse(all(x == x0)) + + def test_tbmv(self): + self._test_all(self.Ttbmv, self.blas.tbmv) + + + def Ttpmv(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, AP, x) + self.assertFalse(all(x == x0)) + + def test_tpmv(self): + self._test_all(self.Ttpmv, self.blas.tpmv) + + def Ttrsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_trsv(self): + self._test_all(self.Ttrsv, self.blas.trsv) + + def Ttpsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_tpsv(self): + self._test_all(self.Ttpsv, self.blas.tpsv) + + def Ttbsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + k = 0 + x0 = x.copy() + fn(uplo, trans, diag, n, k, A, x) + self.assertFalse(all(x == x0)) + + def test_tbsv(self): + self._test_all(self.Ttbsv, self.blas.tbsv) + + + def Tsymv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + y0 = y.copy() + fn(uplo, n, alpha, A, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_symv(self): + self._test_all(self.Tsymv, self.blas.symv) + + Themv = Tsymv + + def 
test_hemv(self): + self._test_complex(self.Themv, self.blas.hemv) + + + def Tsbmv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + k = 0 + y0 = y.copy() + fn(uplo, n, k, alpha, A, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_sbmv(self): + self._test_float(self.Tsbmv, self.blas.sbmv) + + Thbmv = Tsbmv + + def test_hbmv(self): + self._test_complex(self.Thbmv, self.blas.hbmv) + + def Tspmv(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + y0 = y.copy() + fn(uplo, n, alpha, AP, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_spmv(self): + self._test_float(self.Tspmv, self.blas.spmv) + + Thpmv = Tspmv + + def test_hpmv(self): + self._test_complex(self.Thpmv, self.blas.hpmv) + + def Tger(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + m = n = 3 + A0 = A.copy() + fn(m, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + def test_ger(self): + self._test_float(self.Tger, self.blas.ger) + + def test_geru(self): + self._test_complex(self.Tger, self.blas.geru) + + def test_gerc(self): + self._test_complex(self.Tger, self.blas.gerc) + + def Tsyr(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, A) + self.assertFalse(np.all(A == A0)) + + def test_syr(self): + self._test_all(self.Tsyr, self.blas.syr) + + def Ther(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, A) + self.assertFalse(np.all(A == A0)) + + def test_her(self): + self._test_complex(self.Ther, self.blas.her) + + def Tspr(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + AP0 = AP.copy() + fn(uplo, n, alpha, x, AP) + self.assertFalse(np.all(AP == AP0)) + + def test_spr(self): + self._test_float(self.Tspr, self.blas.spr) + + Thpr = Tspr + + def test_hpr(self): + self._test_complex(self.Thpr, self.blas.hpr) + + def Tsyr2(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + Ther2 = Tsyr2 + + def test_syr2(self): + self._test_all(self.Tsyr2, self.blas.syr2) + + def test_her2(self): + self._test_complex(self.Ther2, self.blas.her2) + + def Tspr2(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + Thpr2 = Tspr2 + + def test_spr2(self): + self._test_float(self.Tspr2, self.blas.spr2) + + def test_hpr2(self): + 
self._test_complex(self.Thpr2, self.blas.hpr2) + + # Level 3 + + def Tgemm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + transa = 'N' + transb = 'N' + m = n = k = 3 + C0 = C.copy() + fn(transa, transb, m, n, k, alpha, A, B, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_gemm(self): + self._test_all(self.Tgemm, self.blas.gemm) + + + def Tsyrk(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + trans = 'N' + n = k = 3 + C0 = C.copy() + fn(uplo, trans, n, k, alpha, A, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_syrk(self): + self._test_all(self.Tsyrk, self.blas.syrk) + + Therk = Tsyrk + + def test_herk(self): + self._test_complex(self.Therk, self.blas.herk) + + def Tsymm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + side = 'L' + uplo = 'U' + m = n = 3 + C0 = C.copy() + fn(side, uplo, m, n, alpha, A, B, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_symm(self): + self._test_all(self.Tsymm, self.blas.symm) + + Themm = Tsymm + + def test_hemm(self): + self._test_complex(self.Themm, self.blas.hemm) + + def Ttrsm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + m = n = 3 + B0 = B.copy() + fn(side, uplo, trans, diag, m, n, alpha, A, B) + self.assertFalse(np.all(B == B0)) + + def test_trsm(self): + self._test_all(self.Ttrsm, self.blas.trsm) + + def Ttrmm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + m = n = 3 + C0 = C.copy() + fn(side, uplo, trans, diag, m, n, alpha, A, B, C) + self.assertFalse(np.all(C == C0)) + + def test_trmm(self): + self._test_all(self.Ttrmm, self.blas.trmm) + + def Tdgmm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 2.4], dtype=dtype) + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + side = 'L' + m = n = 3 + C0 = C.copy() + fn(side, m, n, A, x, C) + self.assertFalse(np.all(C == C0)) + + def test_dgmm(self): + self._test_all(self.Tdgmm, self.blas.dgmm) + + + def Tgeam(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + transa = 'N' + transb = 'N' + m = n = 3 + C0 = C.copy() + fn(transa, transb, m, n, alpha, A, beta, B, C) + self.assertFalse(np.all(C == C0)) + + def test_geam(self): + self._test_all(self.Tgeam, 
self.blas.geam) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCuBlasBinding, TestCuBlasAPI) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_fft.py b/pyculib/tests/test_fft.py new file mode 100644 index 0000000..b3e7b9b --- /dev/null +++ b/pyculib/tests/test_fft.py @@ -0,0 +1,364 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCuFFTLib(CUDATestCase): + def test_lib(self): + from pyculib.fft.binding import libcufft + cufft = libcufft() + self.assertNotEqual(libcufft().version, 0) + + +class TestCuFFTPlan(CUDATestCase): + def test_plan1d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 10 + data = np.arange(n, dtype=np.complex64) + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.one(CUFFT_C2C, n) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + def test_plan2d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 2**4 + data = np.arange(n, dtype=np.complex64).reshape(2, n//2) + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.two(CUFFT_C2C, *data.shape) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + def test_plan3d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 32 + data = np.arange(n, dtype=np.complex64).reshape(2, 2, 8) + + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.three(CUFFT_C2C, *data.shape) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + + def test_against_fft_1d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + N = 128 + x = np.asarray(np.arange(N), dtype=np.float32) + xf = np.fft.fft(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(N//2+1, np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, + atol=1e-6) ) + + def test_against_fft_2d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + rank = 2 + rowsize = 128 + N = rowsize * rank + x = np.arange(N, dtype=np.float32).reshape(rank, rowsize) + xf = np.fft.fft2(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(shape=(rank, rowsize//2 + 1), dtype=np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue(np.allclose(xf[:, 0:rowsize//2+1], xf_gpu, atol=1e-6)) + + def test_against_fft_3d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + depth = 2 + colsize = 2 + rowsize = 64 + N = depth * colsize * rowsize + x = np.arange(N, dtype=np.float32).reshape(depth, colsize, rowsize) + xf = np.fft.fftn(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(shape=(depth, colsize, rowsize//2 + 1), 
dtype=np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue(np.allclose(xf[:, :, 0:rowsize//2+1], xf_gpu, atol=1e-6)) + + +class TestCuFFTAPI(CUDATestCase): + def test_fft_1d_single(self): + from pyculib.fft import fft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float32) + xf = np.fft.fft(x) + + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_1d_double(self): + from pyculib.fft import fft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float64) + xf = np.fft.fft(x) + + xf_gpu = np.zeros(shape=N//2 + 1, dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_2d_single(self): + from pyculib.fft import fft + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N2, N1) + xf = np.fft.fft2(x) + + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_2d_single_col_major(self): + from pyculib.fft import fft + N2 = 2 + N1 = 8 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float32).reshape((N2, N1), order='F') + xf_ref = np.fft.rfft2(x) + + xf = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64, order='F') + fft(x, xf) + self.assertTrue( np.allclose(xf_ref, xf, atol=1e-6) ) + + def test_invfft_2d_single_col_major(self): + from pyculib.fft import ifft + N2 = 4 + N1 = 16 + N = (N1//2 + 1) * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape((N2, N1//2 + 1), order='F') + xf_ref = np.fft.irfft2(x) + xf = np.empty(shape=(N2, N1), dtype=np.float32, order='F') + ifft(x, xf) + # Note the different normalization conventions in np.fft and cuda.fft ! 
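+        # cuFFT's inverse transform is unnormalized, so dividing by the number of samples (N1*N2) matches np.fft.irfft2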
+ xf /= N1*N2 + self.assertTrue(np.allclose(xf_ref, xf, atol=1e-6)) + + def test_fft_2d_double(self): + from pyculib.fft import fft + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float64).reshape(N2, N1) + xf = np.fft.fft2(x) + + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_3d_single(self): + from pyculib.fft import fft + N3 = 2 + N2 = 2 + N1 = 32 + N = N1 * N2 * N3 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N3, N2, N1) + xf = np.fft.fftn(x) + + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, :, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_3d_double(self): + from pyculib.fft import fft + N3 = 2 + N2 = 2 + N1 = 32 + N = N1 * N2 * N3 + x = np.asarray(np.arange(N), dtype=np.float64).reshape(N3, N2, N1) + xf = np.fft.fftn(x) + + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, :, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_1d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float32) + x0 = x.copy() + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_1d_roundtrip_double(self): + from pyculib.fft import fft, ifft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float64) + x0 = x.copy() + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex128) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + + def test_fft_2d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N2 = 2 + N1 = 32 + N = N2 * N1 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N2, N1) + x0 = x.copy() + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_3d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N3 = 2 + N2 = 2 + N1 = 32 + N = N3 * N2 * N1 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N3, N2, N1) + x0 = x.copy() + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_inplace_1d_single(self): + from pyculib.fft import fft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex64) + xf = np.fft.fft(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_1d_double(self): + from pyculib.fft import fft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex128) + xf = np.fft.fft(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_2d_single(self): + from pyculib.fft import fft_inplace + N1 = 32 + N2 = 2 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape(N2, N1) + xf = np.fft.fft2(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_2d_double(self): + from pyculib.fft import fft_inplace + N1 = 32 + N2 = 2 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex128).reshape(N2, N1) + xf = np.fft.fft2(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_1d_roundtrip_single_2(self): + from pyculib.fft import fft_inplace, 
ifft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex64) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_1d_roundtrip_double_2(self): + from pyculib.fft import fft_inplace, ifft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex128) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + + def test_fft_2d_roundtrip_single_2(self): + from pyculib.fft import fft_inplace, ifft_inplace + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape(N2, N1) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_3d_roundtrip_double(self): + from pyculib.fft import fft_inplace, ifft_inplace + N3 = 2 + N2 = 2 + N1 = 8 + N = N3 * N2 * N1 + x = np.asarray(np.arange(N), dtype=np.complex128).reshape(N3, N2, N1) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + + +test_cases = (TestCuFFTLib, TestCuFFTPlan, TestCuFFTAPI) + +if __name__ == '__main__': + unittest.main() + diff --git a/pyculib/tests/test_rand.py b/pyculib/tests/test_rand.py new file mode 100644 index 0000000..eb4f98e --- /dev/null +++ b/pyculib/tests/test_rand.py @@ -0,0 +1,219 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCURand(CUDATestCase): + def test_lib(self): + from pyculib.rand.binding import libcurand + curand = libcurand() + self.assertNotEqual(libcurand().version, 0) + +class TestCURandPseudo(CUDATestCase): + def setUp(self): + from pyculib.rand.binding import (Generator, + CURAND_RNG_PSEUDO_DEFAULT) + + self.N = 10 + self.ary32 = np.zeros(self.N, dtype=np.float32) + self.ary64 = np.zeros(self.N, dtype=np.float64) + + self.stream = cuda.stream() + self.devary32 = cuda.to_device(self.ary32, stream=self.stream) + self.devary64 = cuda.to_device(self.ary64, stream=self.stream) + + self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT) + self.rndgen.set_stream(self.stream) + self.rndgen.set_offset(123) + self.rndgen.set_pseudo_random_generator_seed(1234) + + def tearDown(self): + self.devary32.copy_to_host(self.ary32, stream=self.stream) + self.devary64.copy_to_host(self.ary64, stream=self.stream) + + self.stream.synchronize() + + self.assertTrue(any(self.ary32 != 0)) + self.assertTrue(any(self.ary64 != 0)) + + del self.N + del self.ary32 + del self.ary64 + del self.stream + del self.devary32 + del self.devary64 + + def test_uniform(self): + self.rndgen.generate_uniform(self.devary32, self.N) + self.rndgen.generate_uniform(self.devary64, self.N) + + + def test_normal(self): + self.rndgen.generate_normal(self.devary32, self.N, 0, 1) + self.rndgen.generate_normal(self.devary64, self.N, 0, 1) + + def test_log_normal(self): + self.rndgen.generate_log_normal(self.devary32, self.N, 0, 1) + self.rndgen.generate_log_normal(self.devary64, self.N, 0, 1) + + +class TestCURandPoisson(CUDATestCase): + def setUp(self): + from pyculib.rand.binding 
import (Generator, + CURAND_RNG_PSEUDO_DEFAULT) + + self.N = 10 + self.ary32 = np.zeros(self.N, dtype=np.uint32) + + self.stream = cuda.stream() + self.devary32 = cuda.to_device(self.ary32, stream=self.stream) + + self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT) + self.rndgen.set_stream(self.stream) + self.rndgen.set_offset(123) + self.rndgen.set_pseudo_random_generator_seed(1234) + + def tearDown(self): + self.devary32.copy_to_host(self.ary32, stream=self.stream) + + self.stream.synchronize() + + self.assertTrue(any(self.ary32 != 0)) + + del self.N + del self.ary32 + del self.stream + del self.devary32 + + def test_poisson(self): + self.rndgen.generate_poisson(self.devary32, self.N, 1) + + +class TestCURandQuasi(CUDATestCase): + def test_generate(self): + from pyculib.rand.binding import (Generator, + CURAND_RNG_QUASI_SOBOL64, + CURAND_RNG_QUASI_DEFAULT) + N = 10 + stream = cuda.stream() + + ary32 = np.zeros(N, dtype=np.uint32) + devary32 = cuda.to_device(ary32, stream=stream) + + rndgen = Generator(CURAND_RNG_QUASI_DEFAULT) + rndgen.set_stream(stream) + rndgen.set_offset(123) + rndgen.set_quasi_random_generator_dimensions(1) + rndgen.generate(devary32, N) + + devary32.copy_to_host(ary32, stream=stream) + stream.synchronize() + + self.assertTrue(any(ary32 != 0)) + + + ary64 = np.zeros(N, dtype=np.uint64) + devary64 = cuda.to_device(ary64, stream=stream) + + rndgen = Generator(CURAND_RNG_QUASI_SOBOL64) + rndgen.set_stream(stream) + rndgen.set_offset(123) + rndgen.set_quasi_random_generator_dimensions(1) + rndgen.generate(devary64, N) + + devary64.copy_to_host(ary64, stream=stream) + stream.synchronize() + + self.assertTrue(any(ary64 != 0)) + + +class TestCURandAPI(CUDATestCase): + def test_pseudo(self): + from pyculib import rand + prng = rand.PRNG() + prng.seed = 0xbeef + N = 10 + ary = np.zeros(N, dtype=np.float32) + prng.uniform(ary, N) + self.assertTrue(any(ary != 0)) + + iary = np.zeros(N, dtype=np.uint32) + prng.poisson(iary, N) + self.assertTrue(any(iary != 0)) + + def test_quasi(self): + from pyculib import rand + qrng = rand.QRNG() + qrng.ndim = 2 + N = 10 + ary = np.zeros(N, dtype=np.uint32) + qrng.generate(ary, N) + self.assertTrue(any(ary != 0)) + + +class TestTopLevel(CUDATestCase): + def test_uniform(self): + from pyculib import rand + A = rand.uniform(10) + B = rand.uniform(10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_normal(self): + from pyculib import rand + A = rand.normal(0, 1, 10) + B = rand.normal(0, 1, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_lognormal(self): + from pyculib import rand + A = rand.lognormal(0, 1, 10) + B = rand.lognormal(0, 1, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_poisson(self): + from pyculib import rand + A = rand.poisson(10, 10) + B = rand.poisson(10, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_quasi(self): + from pyculib import rand + A = rand.quasi(10, nd=1, bits=32) + B = rand.quasi(10, nd=1, bits=32) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=1, bits=64) + B = rand.quasi(10, nd=1, bits=64) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=5, bits=32) + B = rand.quasi(10, nd=5, bits=32) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=5, bits=64) + B = rand.quasi(10, nd=5, bits=64) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + 
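+    # pick up every test case class defined in this module by its 'Test' name prefix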
for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCURand, TestCURandPseudo, TestCURandPoisson, TestCURandQuasi, + TestCURandAPI, TestTopLevel) + + +if __name__ == '__main__': + unittest.main() + diff --git a/pyculib/tests/test_sorting.py b/pyculib/tests/test_sorting.py new file mode 100644 index 0000000..22e8fe0 --- /dev/null +++ b/pyculib/tests/test_sorting.py @@ -0,0 +1,151 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase + +SELECT_THRESHOLD = 100000 + + +class TestRadixSort(CUDATestCase): + def _test_sort(self, dtype, counts, reverse=False, seed=0, + getindices=False): + from pyculib import sorting + np.random.seed(seed) + for count in counts: + data = (np.random.rand(count) * 10 * count).astype(dtype) + orig = data.copy() + gold = data.copy() + gold.sort() + + if reverse: + gold = gold[::-1] + rs = sorting.RadixSort(maxcount=count, dtype=data.dtype, + descending=reverse) + if getindices: + indices = rs.argsort(data) + else: + indices = rs.sort(data) + self.assertTrue(np.all(data == gold)) + if getindices: + self.assertTrue(np.all(orig[indices] == gold)) + else: + self.assertIsNone(indices) + + def test_sort_float32(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.float32, counts) + self._test_sort(np.float32, counts, reverse=True) + self._test_sort(np.float32, counts, reverse=True, getindices=True) + + def test_sort_int32(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.int32, counts) + self._test_sort(np.int32, counts, reverse=True) + self._test_sort(np.int32, counts, reverse=True, getindices=True) + + def test_sort_float64(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.float64, counts) + self._test_sort(np.float64, counts, reverse=True) + self._test_sort(np.float64, counts, reverse=True, getindices=True) + + def _test_select(self, dtype, counts, ks, reverse=False, seed=0, + getindices=False): + from pyculib import sorting + np.random.seed(seed) + for k, count in zip(ks, counts): + data = (np.random.rand(count) * 10 * count).astype(dtype) + orig = data.copy() + gold = data.copy() + gold.sort() + if reverse: + gold = gold[::-1] + gold = gold[:k] + rs = sorting.RadixSort(maxcount=count, dtype=data.dtype, + descending=reverse) + if getindices: + indices = rs.argselect(keys=data, k=k) + else: + indices = rs.select(keys=data, k=k) + data = data[:k] + self.assertTrue(np.all(data == gold)) + # print(data, gold) + if getindices: + # print(indices) + # print(orig[indices]) + self.assertTrue(np.all(orig[indices] == gold)) + else: + self.assertIsNone(indices) + + def test_select_float32(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.float32, counts, ks) + self._test_select(np.float32, counts, ks, reverse=True) + self._test_select(np.float32, counts, ks, reverse=True, + getindices=True) + + def test_select_int32(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.int32, counts, ks) + self._test_select(np.int32, counts, ks, reverse=True) + self._test_select(np.int32, counts, ks, reverse=True, + getindices=True) + + def test_select_float64(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, 
SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.float64, counts, ks) + self._test_select(np.float64, counts, ks, reverse=True, + getindices=True) + + +class TestSegmentedSort(CUDATestCase): + def _test_generic(self, dtype, divby=1): + from pyculib import sorting + keys = np.array(list(reversed(range(100))), dtype=dtype) / divby + reference = keys.copy() + original = keys.copy() + vals = np.arange(keys.size, dtype=np.int32) + segments = np.array([10, 40, 70], dtype=np.int32) + sorting.segmented_sort(keys, vals, segments) + + reference[:10].sort() + reference[10:40].sort() + reference[40:70].sort() + reference[70:].sort() + + self.assertTrue(np.all(keys == reference)) + self.assertTrue(np.all(original[vals] == reference)) + + def test_float32(self): + self._test_generic(np.float32, divby=10) + + def test_float64(self): + self._test_generic(np.float64, divby=10) + + def test_int32(self): + self._test_generic(np.int32) + + def test_int64(self): + self._test_generic(np.int64) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestRadixSort, TestSegmentedSort) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_sparse.py b/pyculib/tests/test_sparse.py new file mode 100644 index 0000000..4d751d9 --- /dev/null +++ b/pyculib/tests/test_sparse.py @@ -0,0 +1,660 @@ +from __future__ import print_function, absolute_import +import numpy as np +import scipy.sparse +import unittest +from .base import CUDATestCase + + +class TestCuSparseLevel1(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def tearDown(self): + del self.cus + + def generic_test_axpyi(self, dtype): + alpha = 2 + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.zeros(shape=xval.size * 2, dtype=xval.dtype) + self.cus.axpyi(alpha, xval, xind, y) + self.assertTrue(np.allclose(y[xind], (xval * 2))) + + def test_Saxpyi(self): + self.generic_test_axpyi(dtype=np.float32) + + def test_Daxpyi(self): + self.generic_test_axpyi(dtype=np.float64) + + def test_Caxpyi(self): + self.generic_test_axpyi(dtype=np.complex64) + + def test_Zaxpyi(self): + self.generic_test_axpyi(dtype=np.complex128) + + def generic_test_doti(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + result = self.cus.doti(xval, xind, y) + self.assertTrue(result) + + def test_Sdoti(self): + self.generic_test_doti(dtype=np.float32) + + def test_Zdoti(self): + self.generic_test_doti(dtype=np.complex128) + + def generic_test_dotci(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + result = self.cus.dotci(xval, xind, y) + self.assertTrue(result) + + def test_Zdotci(self): + self.generic_test_dotci(dtype=np.complex128) + + def generic_test_gthr(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + self.cus.gthr(y, xval, xind) + self.assertTrue(np.all(xval == 1)) + + def test_Sgthr(self): + 
self.generic_test_gthr(dtype=np.float32) + + def test_Cgthr(self): + self.generic_test_gthr(dtype=np.complex64) + + def generic_test_gthrz(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + self.cus.gthrz(y, xval, xind) + self.assertTrue(np.all(xval == 1)) + self.assertTrue(np.all(y[xind] == 0)) + + def test_Dgthr(self): + self.generic_test_gthrz(dtype=np.float64) + + def test_Zgthr(self): + self.generic_test_gthrz(dtype=np.complex128) + + def generic_test_roti(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + c = .2 + s = .3 + oldxval = xval.copy() + oldy = y.copy() + self.cus.roti(xval, xind, y, c, s) + self.assertFalse(np.all(oldxval == xval)) + self.assertFalse(np.all(oldy == y)) + + def test_Sroti(self): + self.generic_test_roti(dtype=np.float32) + + def test_Droti(self): + self.generic_test_roti(dtype=np.float64) + + def generic_test_sctr(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + oldy = y.copy() + self.cus.sctr(xval, xind, y) + self.assertFalse(np.all(oldy == y)) + + def test_Ssctr(self): + self.generic_test_sctr(dtype=np.float32) + + def test_Csctr(self): + self.generic_test_sctr(dtype=np.complex64) + + +class TestCuSparseMatrixOp(CUDATestCase): + def test_bsr_matrix(self): + from pyculib.sparse import bsr_matrix + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + expect = scipy.sparse.bsr_matrix((data, (row, col)), shape=(3, 3)) + mat = bsr_matrix((data, (row, col)), shape=(3, 3)) + host = mat.copy_to_host() + self.assertTrue(np.all(host.indices == expect.indices)) + self.assertTrue(np.all(host.indptr == expect.indptr)) + self.assertTrue(np.all(host.data == expect.data)) + + def test_matdescr(self): + from pyculib.sparse import Sparse + sparse = Sparse() + md = sparse.matdescr() + md.diagtype = 'N' + md.fillmode = 'L' + md.indexbase = 0 + md.matrixtype = 'G' + + self.assertEqual('N', md.diagtype) + self.assertEqual('L', md.fillmode) + self.assertEqual(0, md.indexbase) + self.assertEqual('G', md.matrixtype) + del md + + +class TestCuSparseLevel2(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def generic_test_bsrmv(self, dtype): + from pyculib.sparse import bsr_matrix + + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6], dtype=dtype) + + bsrmat = bsr_matrix((data, (row, col)), shape=(3, 3)) + x = np.ones(3, dtype=dtype) + y = np.ones(3, dtype=dtype) + oldy = y.copy() + + alpha = 1 + beta = 1 + descr = self.cus.matdescr() + self.cus.bsrmv_matrix('C', 'N', alpha, descr, bsrmat, x, beta, y) + + self.assertFalse(np.all(y == oldy)) + + def test_Sbsrmv(self): + dtype = np.float32 + self.generic_test_bsrmv(dtype=dtype) + + def test_Cbsrmv(self): + dtype = np.complex64 + self.generic_test_bsrmv(dtype=dtype) + + def test_Sbsrxmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + alpha = 0 + beta = 0 + descr = self.cus.matdescr() + bsrVal = np.zeros(10, dtype=dtype) + bsrMaskPtr = np.zeros(10, dtype=np.int32) + bsrRowPtr = np.zeros(10, dtype=np.int32) + bsrEndPtr = np.zeros(10, dtype=np.int32) + bsrColInd = np.zeros(10, dtype=np.int32) + blockDim = 1 + x = np.zeros(10, dtype=dtype) + 
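+        # alpha = beta = 0 with all-zero inputs: this only checks that the bsrxmv call completes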
y = np.zeros(10, dtype=dtype) + self.cus.bsrxmv('C', 'N', 1, 1, 1, 1, alpha, descr, bsrVal, + bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, + blockDim, x, beta, y) + + def test_Scsrmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + alpha = 0 + beta = 0 + descr = self.cus.matdescr() + csrVal = np.zeros(10, dtype=dtype) + csrColInd = np.zeros(10, dtype=np.int32) + csrRowPtr = np.zeros(10, dtype=np.int32) + x = np.zeros(10, dtype=dtype) + y = np.zeros(10, dtype=dtype) + trans = 'N' + m = 1 + n = 1 + nnz = 1 + self.cus.csrmv(trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, + csrColInd, x, beta, y) + + def test_Scsrmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descr = self.cus.matdescr() + csrVal = np.zeros(10, dtype=dtype) + csrColInd = np.zeros(10, dtype=np.int32) + csrRowPtr = np.zeros(10, dtype=np.int32) + trans = 'N' + m = 1 + nnz = 1 + info = self.cus.csrsv_analysis(trans, m, nnz, descr, csrVal, + csrRowPtr, csrColInd) + + alpha = 1.0 + x = np.zeros(10, dtype=dtype) + y = np.zeros(10, dtype=dtype) + self.cus.csrsv_solve(trans, m, alpha, descr, csrVal, csrRowPtr, + csrColInd, info, x, y) + + +class TestCuSparseLevel3(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_Scsrmm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descrA = self.cus.matdescr() + B = C = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + ldb = 1 + ldc = 1 + m = 1 + n = 1 + k = 1 + nnz = 1 + alpha = 1 + beta = 1 + transA = 'N' + self.cus.csrmm(transA, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + + def test_Ccsrmm(self): + """ + Just exercise the codepath + """ + dtype = np.complex64 + + descrA = self.cus.matdescr() + B = C = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + ldb = 1 + ldc = 1 + m = 1 + n = 1 + k = 1 + nnz = 1 + alpha = 1 + beta = 1 + transA = transB = 'N' + self.cus.csrmm2(transA, transB, m, n, k, nnz, alpha, descrA, + csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + + def test_Scsrsm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descrA = self.cus.matdescr() + X = Y = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + m = 1 + n = 1 + nnz = 1 + transA = 'N' + info = self.cus.csrsm_analysis(transA, m, nnz, descrA, csrValA, + csrRowPtrA, csrColIndA) + alpha = 1 + ldx = 1 + ldy = 1 + self.cus.csrsm_solve(transA, m, n, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, info, X, ldx, Y, ldy) + + +class TestCuSparseExtra(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_XcsrgeamNnz(self): + """ + Just exercise the codepath + """ + m = n = 1 + nnzA = 1 + nnzB = 1 + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + nnzC = self.cus.XcsrgeamNnz(m, n, descrA, nnzA, csrRowPtrA, csrColIndA, + descrB, nnzB, csrRowPtrB, csrColIndB, + descrC, + csrRowPtrC) + self.assertTrue(isinstance(nnzC, int)) + + def test_Scsrgeam(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + m = n = 1 + nnzA = 1 + nnzB = 1 + alpha = beta = 1 + csrValA = csrValB = csrValC = np.zeros(10, dtype=dtype) + descrA = descrB 
= descrC = self.cus.matdescr() + csrColIndA = csrColIndB = csrColIndC = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + self.cus.csrgeam(m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, beta, descrB, nnzB, csrValB, + csrRowPtrB, csrColIndB, descrC, csrValC, + csrRowPtrC, csrColIndC) + + + def test_XcsrgemmNnz(self): + """ + Just exercise the codepath + """ + m = n = k = 1 + nnzA = 1 + nnzB = 1 + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + transA = transB = 'N' + nnzC = self.cus.XcsrgemmNnz(transA, transB, m, n, k, descrA, nnzA, + csrRowPtrA, + csrColIndA, descrB, nnzB, csrRowPtrB, + csrColIndB, descrC, + csrRowPtrC) + self.assertTrue(isinstance(nnzC, int)) + + def test_Scsrgemm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + m = n = k = 0 + transA = transB = 'N' + nnzA = 0 + nnzB = 0 + csrValA = csrValB = csrValC = np.zeros(10, dtype=dtype) + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = csrColIndC = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + self.cus.csrgemm(transA, transB, m, n, k, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, + csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC) + + def test_csrgemm_ez(self): + from pyculib.sparse import CudaSparseMatrix + from scipy.sparse import csr_matrix + + def random_square_csr_matrix(N): + X = np.random.random((N, N)) + X[X > 0.7] = 0 + return csr_matrix(X) + + a = random_square_csr_matrix(10) + b = random_square_csr_matrix(10) + a_cuda = CudaSparseMatrix().from_host_matrix(a) + b_cuda = CudaSparseMatrix().from_host_matrix(b) + c_cuda = self.cus.csrgemm_ez(a, b) + c_host = c_cuda.copy_to_host() + c_gold = a * b + # Conversion to dense necessary because Scipy and cuSPARSE order their + # indices differently + np.testing.assert_allclose(c_gold.toarray(), c_host.toarray()) + + +class TestCuSparsePreconditioners(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_Scsric0(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = n = 3 + trans = 'N' + + sary = scipy.sparse.rand(m, n, 0.75, format='csr', dtype=dtype) + nnz = sary.nnz + csrValM = sary.data + csrRowPtrA = sary.indptr + csrColIndA = sary.indices + + descr = self.cus.matdescr(matrixtype='S') + info = self.cus.csrsv_analysis(trans, m, nnz, descr, csrValM, + csrRowPtrA, csrColIndA) + self.cus.csric0(trans, m, descr, csrValM, csrRowPtrA, + csrColIndA, info) + + def test_Scsrilu0(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = n = 3 + trans = 'N' + + sary = scipy.sparse.rand(m, n, 0.75, format='csr', dtype=dtype) + csrValM = sary.data + csrRowPtrA = sary.indptr + csrColIndA = sary.indices + + descr = self.cus.matdescr() + info = self.cus.api.solve_analysis_info() + self.cus.csrilu0(trans, m, descr, csrValM, csrRowPtrA, + csrColIndA, info) + + def test_Sgtsv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = 4 + n = 3 + ldb = m + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + B = np.ones((m, n), dtype=dtype, order='F') + Bcopy = B.copy() + self.cus.gtsv(m, n, dl, d, du, B, ldb) + self.assertTrue(not np.all(B == Bcopy)) + + def test_Sgtsv_nopivot(self): + """ 
+ Just exercise the codepath + """ + dtype = np.float32 + + m = 4 + n = 3 + ldb = m + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + B = np.ones((m, n), dtype=dtype, order='F') + Bcopy = B.copy() + self.cus.gtsv_nopivot(m, n, dl, d, du, B, ldb) + self.assertTrue(not np.all(B == Bcopy)) + + def test_SgtsvStridedBatch(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + batchCount = 1 + batchStride = 4 + + m = 4 + n = 3 + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + x = np.ones((m, n), dtype=dtype, order='F') + xcopy = x.copy() + self.cus.gtsvStridedBatch(m, dl, d, du, x, batchCount, batchStride) + self.assertTrue(not np.all(x == xcopy)) + + +class TestCuSparseFormatConversion(CUDATestCase): + """ + These test can corrupt the CUDA context making the remaining test fails + """ + + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def tearDown(self): + del self.cus + + def test_Sbsr2csr(self): + dtype = np.dtype('float32') + + dirA = 'C' + mb = nb = 0 + descrA = self.cus.matdescr() + descrC = self.cus.matdescr() + bsrValA = csrValC = np.zeros(10, dtype=dtype) + bsrRowPtrA = bsrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrC = csrColIndC = np.zeros(10, dtype=np.int32) + blockDim = 1 + self.cus.bsr2csr(dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, + bsrColIndA, blockDim, descrC, csrValC, + csrRowPtrC, csrColIndC) + + def test_Xcoo2csr(self): + nnz = 1 + m = 1 + csrRowPtr = np.zeros(20, dtype=np.int32) + cooRowInd = np.zeros(20, dtype=np.int32) + self.cus.Xcoo2csr(cooRowInd, nnz, m, csrRowPtr) + + def test_Scsc2dense(self): + m = n = 1 + lda = 1 + descrA = self.cus.matdescr() + cscValA = np.zeros(10, dtype=np.float32) + cscRowIndA = cscColPtrA = A = np.zeros(10, dtype=np.int32) + self.cus.csc2dense(m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, + lda) + + def test_Xcsr2bsrNnz(self): + dirA = 'C' + m = n = 1 + blockDim = 1 + descrC = descrA = self.cus.matdescr() + bsrRowPtrC = csrRowPtrA = csrColIndA = np.zeros(10, dtype=np.int32) + self.cus.Xcsr2bsrNnz(dirA, m, n, descrA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrRowPtrC) + + def test_Scsr2bsr(self): + dtype = np.float32 + + dirA = 'C' + m = n = 1 + blockDim = 1 + descrC = descrA = self.cus.matdescr() + csrValA = bsrColIndC = bsrValC = np.zeros(10, dtype=dtype) + bsrRowPtrC = csrRowPtrA = csrColIndA = np.zeros(10, dtype=np.int32) + self.cus.csr2bsr(dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC) + + def test_Xcsr2coo(self): + nnz = m = 1 + csrRowPtr = cooRowInd = np.zeros(10, dtype=np.int32) + self.cus.Xcsr2coo(csrRowPtr, nnz, m, cooRowInd) + + def test_Scsr2csc(self): + dtype = np.float32 + m = n = nnz = 1 + csrVal = cscVal = np.zeros(10, dtype=dtype) + csrRowPtr = csrColInd = np.zeros(10, dtype=np.int32) + cscRowInd = cscColPtr = np.zeros(10, dtype=np.int32) + copyValues = 'N' + self.cus.csr2csc(m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscRowInd, cscColPtr, copyValues) + + def test_Scsr2dense(self): + dtype = np.float32 + + m = n = 1 + lda = 1 + descrA = self.cus.matdescr() + A = csrValA = np.zeros(10, dtype=dtype) + csrRowPtrA = csrColIndA = np.zeros(10, np.int32) + self.cus.csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, + lda) + + def test_Sdense2csc(self): + dtype = np.float32 + + m = n = 2 + lda = m + nnzPerCol = np.ones(n, np.int32) * 2 + descrA = 
self.cus.matdescr() + A = np.ones(n, dtype=dtype) + cscValA = np.zeros(10, dtype=dtype) + cscRowIndA = np.zeros(10, np.int32) + cscColPtrA = np.zeros(10, np.int32) + + self.cus.dense2csc(m, n, descrA, A, lda, nnzPerCol, cscValA, + cscRowIndA, cscColPtrA) + + def test_Sdense2csr(self): + dtype = np.float32 + + m = n = 2 + lda = m + nnzPerRow = np.ones(n, np.int32) * 2 + descrA = self.cus.matdescr() + A = np.ones(10, dtype=dtype) + csrValA = np.zeros(10, dtype=dtype) + csrRowPtrA = csrColIndA = np.zeros(10, np.int32) + self.cus.dense2csr(m, n, descrA, A, lda, nnzPerRow, csrValA, + csrRowPtrA, csrColIndA) + + def test_Snnz(self): + dtype = np.float32 + dirA = 'C' + m = n = 2 + lda = m + descrA = self.cus.matdescr() + A = np.ones(10, dtype=dtype) + nnzPerRowColumn = np.zeros(10, np.int32) + self.cus.nnz(dirA, m, n, descrA, A, lda, nnzPerRowColumn) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCuSparseLevel1, TestCuSparseMatrixOp, TestCuSparseLevel2, + TestCuSparseLevel3, TestCuSparseExtra, + TestCuSparsePreconditioners, TestCuSparseFormatConversion) + + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/utils/__init__.py b/pyculib/utils/__init__.py new file mode 100644 index 0000000..4b74290 --- /dev/null +++ b/pyculib/utils/__init__.py @@ -0,0 +1,4 @@ +from __future__ import absolute_import +from .libutils import ctype_function, Lib +from .cctypes import c_complex, c_double_complex, memalign +from .finalizer import OwnerMixin diff --git a/pyculib/utils/cctypes.py b/pyculib/utils/cctypes.py new file mode 100644 index 0000000..99f6a58 --- /dev/null +++ b/pyculib/utils/cctypes.py @@ -0,0 +1,47 @@ +from __future__ import absolute_import, print_function, division +from ctypes import c_float, c_double, Structure, c_uint8, sizeof, addressof +import numpy as np + + +class c_complex(Structure): + _fields_ = [('real', c_float), ('imag', c_float)] + + def __init__(self, real=0, imag=0): + if isinstance(real, (complex, np.complex64, np.complex128)): + real, imag = real.real, real.imag + super(c_complex, self).__init__(real, imag) + + @property + def value(self): + return complex(self.real, self.imag) + + +class c_double_complex(Structure): + _fields_ = [('real', c_double), ('imag', c_double)] + + def __init__(self, real=0, imag=0): + if isinstance(real, (complex, np.complex64, np.complex128)): + real, imag = real.real, real.imag + super(c_double_complex, self).__init__(real, imag) + + @property + def value(self): + return complex(self.real, self.imag) + + +def memalign(cty, align): + """Allocate a ctype object on the specific byte alignment + """ + # Allocate bytes with offset + mem = (c_uint8 * (sizeof(cty) + align))() + addr = addressof(mem) + + # Move to alignment + offset = addr % align + if offset: + offset = align - offset + + buf = cty.from_address(offset + addr) + assert 0 == addressof(buf) % align + + return buf, mem diff --git a/pyculib/utils/finalizer.py b/pyculib/utils/finalizer.py new file mode 100644 index 0000000..17ca300 --- /dev/null +++ b/pyculib/utils/finalizer.py @@ -0,0 +1,43 @@ +''' +Modified C-level finalizer by Benjamin Peterson +Available at http://code.activestate.com/recipes/577242-calling-c-level-finalizers-without-__del__/ +''' 
+import weakref + +class OwnerRef(weakref.ref): + """A simple weakref.ref subclass, so attributes can be added.""" + pass + +def _run_finalizer(ref): + """Internal weakref callback to run finalizers""" + del _finalize_refs[ref.owner] + for item, finalizer in ref.items: + try: + finalizer(item) + except Exception: + pass + +_finalize_refs = {} + +def track(owner, item, finalizer): + """Register an object for finalization. + + ``owner`` is the the object which is responsible for ``item``. + ``finalizer`` will be called with ``item`` as its only argument when + ``owner`` is destroyed by the garbage collector. + """ + if id(owner) in _finalize_refs: + ref = _finalize_refs[id(owner)] + else: + ref = OwnerRef(owner, _run_finalizer) + ref.owner = id(owner) + ref.items = [] + ref.items.append((item, finalizer)) + _finalize_refs[id(owner)] = ref + +class OwnerMixin(object): + def _finalizer_track(self, item): + if not hasattr(self, '_finalize'): + raise AttributeError("%s must define a _finalize method" % self) + track(self, item, type(self)._finalize) + diff --git a/pyculib/utils/libutils.py b/pyculib/utils/libutils.py new file mode 100644 index 0000000..92434ac --- /dev/null +++ b/pyculib/utils/libutils.py @@ -0,0 +1,55 @@ +from __future__ import absolute_import +from numba.cuda.cudadrv.libs import open_cudalib + +class ctype_function(object): + def __init__(self, restype=None, *argtypes): + self.restype = restype + self.argtypes = argtypes + +class Lib(object): + __singleton = None + lib = None + + def __new__(cls): + # Check if we already have opened the dll + if cls.__singleton is None: + try: + dll = open_cudalib(cls.lib) + except OSError as e: + raise Exception("Cannot open library for %s:\n%s" % (cls.lib, + e)) + # Create new instance + inst = object.__new__(cls) + cls.__singleton = inst + inst.dll = dll + inst._initialize() + else: + inst = cls.__singleton + return inst + + def _initialize(self): + # Populate the instance with the functions + for name, obj in vars(type(self)).items(): + if isinstance(obj, ctype_function): + fn = getattr(self.dll, name) + fn.restype = obj.restype + fn.argtypes = obj.argtypes + setattr(self, name, self._auto_checking_wrapper(fn, name=name)) + + def _auto_checking_wrapper(self, fn, name): + def wrapped(*args, **kws): + nargs = len(args) + len(kws) + expected = len(fn.argtypes) + if nargs != expected: + msg = "expecting {expected} arguments but got {nargs}: {fname}" + raise TypeError(msg.format(expected=expected, nargs=nargs, + fname=name)) + status = fn(*args, **kws) + self.check_error(status) + return status + return wrapped + + def check_error(self, status): + if status != 0: + raise self.ErrorType(status) + diff --git a/pyculib/warnings.py b/pyculib/warnings.py new file mode 100644 index 0000000..88037db --- /dev/null +++ b/pyculib/warnings.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import +from pyculib import config +import warnings # so we can use warnings.warn +from warnings import * + +class PerformanceWarning(Warning): + """ + Warning category for when an operation might not be + as fast as expected. + """ + + +# Define a simple no-op for the (default) case +# where performance warnings are disabled. 
+ +def no_warn(*args, **kwds): pass + +if config.WARNINGS: + warn = warnings.warn + +else: + warn = no_warn diff --git a/runtests.py b/runtests.py new file mode 100755 index 0000000..1125c10 --- /dev/null +++ b/runtests.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +import runpy + +runpy.run_module('pyculib.runtests', run_name='__main__') diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3f5a53c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = pyculib/_version.py +versionfile_build = pyculib/_version.py +tag_prefix = +parentdir_prefix = pyculib diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..862ab39 --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + +from distutils.spawn import spawn +from distutils.command import build + +import versioneer + +class build_doc(build.build): + description = "build documentation" + + def run(self): + spawn(['make', '-C', 'docs', 'html']) + +packages = [ + 'pyculib', + 'pyculib.blas', + 'pyculib.fft', + 'pyculib.rand', + 'pyculib.sparse', + 'pyculib.sorting', + 'pyculib.utils', + 'pyculib.tests', +] + +cmdclass = versioneer.get_cmdclass() +cmdclass['build_doc'] = build_doc + +if __name__ == '__main__': + setup( + name='pyculib', + description='Pyculib - python bindings for NVIDIA CUDA libraries', + author='Continuum Analytics, Inc.', + author_email='support@continuum.io', + url='http://continuum.io', + packages=packages, + license='BSD', + version=versioneer.get_version(), + cmdclass=cmdclass, + ) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..64fea1c --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. 
"git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. 
Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other langauges) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. 
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . 
In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += plus_or_dot(pieces)
+            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
+                                          pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def render_pep440_pre(pieces):
+    """TAG[.post.devDISTANCE] -- No -dirty.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += ".post.dev%d" % pieces["distance"]
+    else:
+        # exception #1
+        rendered = "0.post.dev%d" % pieces["distance"]
+    return rendered
+
+
+def render_pep440_post(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    The ".dev0" means dirty. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyways.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+    return rendered
+
+
+def render_pep440_old(pieces):
+    """TAG[.postDISTANCE[.dev0]] .
+
+    The ".dev0" means dirty.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty] (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces):
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
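+    # Whichever method succeeds, the result is a dict of the same shape,
+    # e.g. (illustrative values only):
+    #   {"version": "1.0+2.g1234567", "full-revisionid": "1234567...",
+    #    "dirty": False, "error": None, "date": "2017-01-01T00:00:00+0000"}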
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? + + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)
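+
+
+# Typical consumption from a project's setup.py (illustrative sketch; it
+# simply mirrors the wiring shown in CONFIG_ERROR above):
+#
+#     import versioneer
+#
+#     setup(version=versioneer.get_version(),
+#           cmdclass=versioneer.get_cmdclass(),
+#           ...)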