diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f87ab00 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +pyculib/_version.py export-subst diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a4d23bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +build +_build +__pycache__ +*.pyc +\#*\# +*.so +*.dll +*.dylib diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..d442eba --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include versioneer.py +include pyculib/_version.py diff --git a/README.md b/README.md index 38c2318..0752f51 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,79 @@ -# pycudalib -pycudalib +# Pyculib + +Pyculib provides Python bindings to the following CUDA libraries: + + * [cuBLAS](https://developer.nvidia.com/cublas) + * [cuFFT](https://developer.nvidia.com/cufft) + * [cuSPARSE](https://developer.nvidia.com/cusparse) + * [cuRAND](https://developer.nvidia.com/curand) + * CUDA Sorting algorithms from the [CUB](https://nvlabs.github.io/cub/) and + [Modern GPU](https://github.com/moderngpu/moderngpu) libraries. + +These bindings are direct ports of those available in [Anaconda +Accelerate](https://docs.continuum.io/accelerate/cuda-libs). + +Documentation is located [here](LINK_TO_DOCS) + +## Installing + +The easiest way to install Pyculib and get updates is by using the [Anaconda +Distribution](https://www.continuum.io/downloads) + +``` +#> conda install pyculib +``` + +To compile from source, it is recommended to create a conda environment +containing the following: + + * cffi + * cudatoolkit + * numpy + * numba + * pyculib\_sorting + * scipy + +for instructions on how to do this see the [conda](https://conda.io/docs/) +documentation, specifically the section on [managing +environments](https://conda.io/docs/using/envs.html#managing-environments). + +Once a suitable environment is activated, installation achieved simply by +running: + +``` +#> python setup.py install +``` + +and the installation can be tested with: + +``` +#> ./runtests.py +``` + + +## Documentation + +Documentation is located [here](LINK_TO_DOCS). + +### Building Documentation + +It is also possible to build a local copy of the documentation from source. +This requires GNU Make and sphinx (available via conda). + + +Documentation is stored in the `doc` folder, and should be built with: + +``` +#> make SPHINXOPTS=-Wn clean html +``` + +This ensures that the documentation renders without errors. If errors occur, +they can all be seen at once by building with: + +``` +#> make SPHINXOPTS=-n clean html +``` + +However, these errors should all be fixed so that building with `-Wn` is +possible prior to merging any documentation changes or updates. + diff --git a/condarecipe/bld.bat b/condarecipe/bld.bat new file mode 100644 index 0000000..39b5e1f --- /dev/null +++ b/condarecipe/bld.bat @@ -0,0 +1 @@ +%PYTHON% setup.py install diff --git a/condarecipe/build.sh b/condarecipe/build.sh new file mode 100755 index 0000000..4dad93a --- /dev/null +++ b/condarecipe/build.sh @@ -0,0 +1,2 @@ +#!/bin/bash +$PYTHON setup.py install diff --git a/condarecipe/meta.yaml b/condarecipe/meta.yaml new file mode 100644 index 0000000..52d61a6 --- /dev/null +++ b/condarecipe/meta.yaml @@ -0,0 +1,33 @@ +package: + name: pyculib + version: {{ GIT_DESCRIBE_TAG }} + +source: + path: .. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER|int }} + +requirements: + build: + - python + - numpy x.x + - numba + - cffi + run: + - python + - numpy x.x + - numba + - cudatoolkit 7.5 + - libgfortran [linux64] + - pyculib_sorting 1.0.0 + - cffi + +test: + requires: + - pytest + - scipy + imports: + - pyculib + commands: + - python -m unittest -v pyculib diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..525be10 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
+ +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Pyculib.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Pyculib.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Pyculib" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Pyculib" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+ +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..fe25fa4 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. 
The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Pyculib.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Pyculib.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. 
+ goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..0a45f20 --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,48 @@ +# Pyculib QuickStart + +Pyculib provides access to NVIDIA' optimized CUDA libraries +from a high-level, Pythonic interface. It builds on top of the functionality +provided in the open-source Numba JIT compiler. + + +## How do I install it? + +System requirements: + +* Python 2.7, 3.4+ +* Numpy XXX TODO: FIX +* NVIDIA CUDA-enabled GPU with compute + capability 2.0 or above. CUDA Toolkit 7.5 and driver version 349.00 or above + ([https://developer.nvidia.com/cuda-toolkit](https://developer.nvidia.com/cuda-toolkit)) +* Numba 0.33+ + +### Install from Anaconda + +Download and install Anaconda from +[https://www.continuum.io/downloads](https://www.continuum.io/downloads). + +In terminal: + +``` +conda update conda +conda install pyculib +``` + +## How do I use it? + +Pyculib provides access to optimized dense and sparse linear algebra, random +number generators, sorting routines, and FFTs. This example demonstrates the use +of CUDA-FFT routines: + + +### CUDA-Accelerated FFT + +```python +from pyculib.fft import fft +fft(x, xf) +``` + +## Where do I learn more? + +* Full documentation: (Insert URL) +* CUDA-accelerated functions: (Insert URL)/cudalibs.html diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..32e8a41 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Pyculib documentation build configuration file, created by +# sphinx-quickstart on Fri Aug 7 17:43:03 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Pyculib' +copyright = '2017, Continuum Analytics, Inc.' +author = 'Continuum Analytics, Inc.' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. 
Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Pyculibdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + (master_doc, 'Pyculib.tex', 'Pyculib Documentation', + 'Continuum Analytics, Inc.', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pyculib', 'Pyculib Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Pyculib', 'Pyculib Documentation', + author, 'Pyculib', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { 'python': ('https://docs.python.org/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'numpy': ('https://docs.scipy.org/doc/numpy', None), +} + +# Ignore python.array not being resolved +nitpick_ignore = [('py:obj', 'python.array')] diff --git a/docs/source/cublas.rst b/docs/source/cublas.rst new file mode 100644 index 0000000..8760cca --- /dev/null +++ b/docs/source/cublas.rst @@ -0,0 +1,368 @@ +cuBLAS +====== + +Provides basic linear algebra building blocks. See `NVIDIA cuBLAS +`_. + +The cuBLAS binding provides an interface that accepts NumPy arrays and Numba's +CUDA device arrays. The binding automatically transfers NumPy array arguments to +the device as required. This automatic transfer may generate some unnecessary +transfers, so optimal performance is likely to be obtained by the manual +transfer for NumPy arrays into device arrays and using the cuBLAS to manipulate +device arrays where possible. + +No special naming convention is used to identify the data +type, unlike in the BLAS C and Fortran APIs. Arguments for array storage +information which are part of the cuBLAS C API are also not necessary since +NumPy arrays and device arrays contain this information. + +All functions are accessed through the :class:`pyculib.blas.Blas` class: + +.. autoclass:: pyculib.blas.Blas + +BLAS Level 1 +------------ + +.. py:method:: pyculib.blas.Blas.nrm2(x) + + Computes the L2 norm for array `x`. Same as `numpy.linalg.norm(x)`. + + :param x: input vector + :type x: python.array + :returns: resulting norm. + +.. 
py:method:: pyculib.blas.Blas.dot(x, y)
+
+    Compute the dot product of array `x` and array `y`. Same as `np.dot(x, y)`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+    :returns: dot product of `x` and `y`
+
+.. py:method:: pyculib.blas.Blas.dotc(x, y)
+
+    Computes the dot product of array `x` and array `y` using the conjugates of
+    the elements of `x`, for complex dtypes only. Same as `np.vdot(x, y)`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+    :returns: dot product of `x` and `y`
+
+
+.. py:method:: pyculib.blas.Blas.scal(alpha, x)
+
+    Scale `x` inplace by `alpha`. Same as `x = alpha * x`
+
+    :param alpha: scalar
+    :param x: vector
+    :type x: python.array
+
+.. py:method:: pyculib.blas.Blas.axpy(alpha, x, y)
+
+    Compute `y = alpha * x + y` inplace.
+
+    :param alpha: scalar
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.amax(x)
+
+    Find the index of the first largest element in array `x`.
+    Same as `np.argmax(x)`
+
+    :param x: vector
+    :type x: python.array
+    :returns: index (starting from 0).
+
+
+.. py:method:: pyculib.blas.Blas.amin(x)
+
+    Find the index of the first smallest element in array `x`.
+    Same as `np.argmin(x)`
+
+    :param x: vector
+    :type x: python.array
+    :returns: index (starting from 0).
+
+
+.. py:method:: pyculib.blas.Blas.asum(x)
+
+    Compute the sum of all elements in array `x`.
+
+    :param x: vector
+    :type x: python.array
+    :returns: `x.sum()`
+
+.. py:method:: pyculib.blas.Blas.rot(x, y, c, s)
+
+    Apply the Givens rotation matrix specified by the cosine element `c` and the
+    sine element `s` inplace on vector elements `x` and `y`.
+
+    Same as `x, y = c * x + s * y, -s * x + c * y`
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.rotg(a, b)
+
+    Constructs the Givens rotation matrix with the column vector (a, b).
+
+    :param a: first element of the column vector
+    :param b: second element of the column vector
+    :returns: a tuple (r, z, c, s)
+
+        r -- `r = sqrt(a**2 + b**2)`
+
+        z -- Used to reconstruct `c` and `s`.
+             Refer to the cuBLAS documentation for details.
+
+        c -- The cosine element.
+
+        s -- The sine element.
+
+
+.. py:method:: pyculib.blas.Blas.rotm(x, y, param)
+
+    Applies the modified Givens transformation inplace.
+
+    Same as::
+
+        param = flag, h11, h21, h12, h22
+        x[i] = h11 * x[i] + h12 * y[i]
+        y[i] = h21 * x[i] + h22 * y[i]
+
+    Refer to the cuBLAS documentation for the use of `flag`.
+
+    :param x: vector
+    :type x: python.array
+    :param y: vector
+    :type y: python.array
+
+
+.. py:method:: pyculib.blas.Blas.rotmg(d1, d2, x1, y1)
+
+    Constructs the modified Givens transformation `H` that zeros out the second
+    entry of a column vector `(d1 * x1, d2 * y1)`.
+
+    :param d1: scaling factor for the x-coordinate of the input vector
+    :param d2: scaling factor for the y-coordinate of the input vector
+    :param x1: x-coordinate of the input vector
+    :param y1: y-coordinate of the input vector
+
+    :returns: A 1D array that is usable in `rotm`.
+              The first element is the flag for `rotm`.
+              The rest of the elements correspond to the `h11, h21, h12, h22`
+              elements of `H`.
+
+BLAS Level 2
+-------------
+
+All level 2 routines use the following naming convention for all arguments:
+
+* A, B, C, AP -- (2D array) Matrix argument.
+  `AP` implies packed storage for banded matrix.
+* x, y, z -- (1D arrays) Vector argument.
+* alpha, beta -- (scalar) Can be floats or complex numbers depending. +* m -- (scalar) Number of rows of matrix `A`. +* n -- (scalar) Number of columns of matrix `A`. If `m` is not needed, + `n` also means the number of rows of the matrix `A`; thus, + implying a square matrix. +* trans, transa, transb -- (string) + Select the operation `op` to apply to a matrix: + + - 'N': `op(X) = X`, the identity operation; + - 'T': `op(X) = X**T`, the transpose; + - 'C': `op(X) = X**H`, the conjugate transpose. + + `trans` only applies to the only matrix argument. + `transa` and `transb` apply to matrix `A` and matrix `B`, + respectively. +* uplo -- (string) Can be 'U' for filling the upper trianglar matrix; or 'L' for + filling the lower trianglar matrix. +* diag -- (boolean) Whether the matrix diagonal has unit elements. +* mode -- (string) 'L' means the matrix is on the left side in the equation. + 'R' means the matrix is on the right side in the equation. + +.. note:: The last array argument is always overwritten with the result. + +.. py:method:: pyculib.blas.Blas.gbmv(trans, m, n, kl, ku, alpha, A, x, beta, y) + + banded matrix-vector multiplication `y = alpha * op(A) * x + beta * y` where + `A` has `kl` sub-diagonals and `ku` super-diagonals. + +.. py:method:: pyculib.blas.Blas.gemv(trans, m, n, alpha, A, x, beta, y) + + matrix-vector multiplication `y = alpha * op(A) * x + beta * y` + +.. py:method:: pyculib.blas.Blas.trmv(uplo, trans, diag, n, A, x) + + triangular matrix-vector multiplication `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.tbmv(uplo, trans, diag, n, k, A, x) + + triangular banded matrix-vector `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.tpmv(uplo, trans, diag, n, AP, x) + + triangular packed matrix-vector multiplication `x = op(A) * x` + +.. py:method:: pyculib.blas.Blas.trsv(uplo, trans, diag, n, A, x) + + Solves the triangular linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.tpsv(uplo, trans, diag, n, AP, x) + + Solves the packed triangular linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.tbsv(uplo, trans, diag, n, k, A, x) + + Solves the triangular banded linear system with a single right-hand-side. + `op(A) * x = b` + +.. py:method:: pyculib.blas.Blas.symv(uplo, n, alpha, A, x, beta, y) + + symmetric matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hemv(uplo, n, alpha, A, x, beta, y) + + Hermitian matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.sbmv(uplo, n, k, alpha, A, x, beta, y) + + symmetric banded matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hbmv(uplo, n, k, alpha, A, x, beta, y) + + Hermitian banded matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.spmv(uplo, n, alpha, AP, x, beta, y) + + symmetric packed matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.hpmv(uplo, n, alpha, AP, x, beta, y) + + Hermitian packed matrix-vector multiplication `y = alpha * A * x + beta * y` + +.. py:method:: pyculib.blas.Blas.ger(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** T + A` + +.. py:method:: pyculib.blas.Blas.geru(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** T + A` + +.. py:method:: pyculib.blas.Blas.gerc(m, n, alpha, x, y, A) + + the rank-1 update `A := alpha * x * y ** H + A` + +.. 
py:method:: pyculib.blas.Blas.syr(uplo, n, alpha, x, A)
+
+    symmetric rank 1 operation `A := alpha * x * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.her(uplo, n, alpha, x, A)
+
+    Hermitian rank 1 operation `A := alpha * x * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.spr(uplo, n, alpha, x, AP)
+
+    packed symmetric rank 1 operation `A := alpha * x * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.hpr(uplo, n, alpha, x, AP)
+
+    packed Hermitian rank 1 operation `A := alpha * x * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.syr2(uplo, n, alpha, x, y, A)
+
+    symmetric rank-2 update `A = alpha * x * y ** T + y * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.her2(uplo, n, alpha, x, y, A)
+
+    Hermitian rank-2 update `A = alpha * x * y ** H + alpha * y * x ** H + A`
+
+.. py:method:: pyculib.blas.Blas.spr2(uplo, n, alpha, x, y, A)
+
+    packed symmetric rank-2 update `A = alpha * x * y ** T + y * x ** T + A`
+
+.. py:method:: pyculib.blas.Blas.hpr2(uplo, n, alpha, x, y, A)
+
+    packed Hermitian rank-2 update `A = alpha * x * y ** H + alpha * y * x ** H + A`
+
+BLAS Level 3
+-------------
+
+All level 3 routines follow the same naming convention for arguments as in the
+level 2 routines.
+
+.. py:method:: pyculib.blas.Blas.gemm(transa, transb, m, n, k, alpha, A, B, beta, C)
+
+    matrix-matrix multiplication `C = alpha * op(A) * op(B) + beta * C`
+
+.. py:method:: pyculib.blas.Blas.syrk(uplo, trans, n, k, alpha, A, beta, C)
+
+    symmetric rank-k update `C = alpha * op(A) * op(A) ** T + beta * C`
+
+.. py:method:: pyculib.blas.Blas.herk(uplo, trans, n, k, alpha, A, beta, C)
+
+    Hermitian rank-k update `C = alpha * op(A) * op(A) ** H + beta * C`
+
+.. py:method:: pyculib.blas.Blas.symm(side, uplo, m, n, alpha, A, B, beta, C)
+
+    symmetric matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * A * B + beta * C
+        else:        # side == 'R'
+            C = alpha * B * A + beta * C
+
+.. py:method:: pyculib.blas.Blas.hemm(side, uplo, m, n, alpha, A, B, beta, C)
+
+    Hermitian matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * A * B + beta * C
+        else:        # side == 'R'
+            C = alpha * B * A + beta * C
+
+.. py:method:: pyculib.blas.Blas.trsm(side, uplo, trans, diag, m, n, alpha, A, B)
+
+    Solves the triangular linear system with multiple right-hand-sides::
+
+        if side == 'L':
+            op(A) * X = alpha * B
+        else:        # side == 'R'
+            X * op(A) = alpha * B
+
+
+.. py:method:: pyculib.blas.Blas.trmm(side, uplo, trans, diag, m, n, alpha, A, B, C)
+
+    triangular matrix-matrix multiplication::
+
+        if side == 'L':
+            C = alpha * op(A) * B
+        else:        # side == 'R'
+            C = alpha * B * op(A)
+
+.. py:method:: pyculib.blas.Blas.dgmm(side, m, n, A, x, C)
+
+    matrix-matrix multiplication with a diagonal matrix built from the vector `x`::
+
+        if mode == 'R':
+            C = A * diag(x)
+        else:        # mode == 'L'
+            C = diag(x) * A
+
+
+.. py:method:: pyculib.blas.Blas.geam(transa, transb, m, n, alpha, A, beta, B, C)
+
+    matrix-matrix addition/transposition `C = alpha * op(A) + beta * op(B)`
diff --git a/docs/source/cuda-libs.rst b/docs/source/cuda-libs.rst
new file mode 100644
index 0000000..ef2b409
--- /dev/null
+++ b/docs/source/cuda-libs.rst
@@ -0,0 +1,14 @@
+==============
+CUDA libraries
+==============
+
+The following CUDA libraries have bindings and algorithms that are available for use with Pyculib:
+
+..
toctree::
+    :maxdepth: 1
+
+    cublas
+    cusparse
+    cufft
+    curand
+    sorting
diff --git a/docs/source/cufft.rst b/docs/source/cufft.rst
new file mode 100644
index 0000000..26cfd66
--- /dev/null
+++ b/docs/source/cufft.rst
@@ -0,0 +1,38 @@
+cuFFT
+=======
+
+Provides FFT and inverse FFT for 1D, 2D and 3D arrays.
+See `NVIDIA cuFFT <https://developer.nvidia.com/cufft>`_.
+
+
+.. note:: cuFFT only supports FFT operations on numpy.float32, numpy.float64,
+    numpy.complex64 and numpy.complex128 with a C-contiguous data layout.
+
+
+Forward FFT
+------------
+
+.. py:function:: pyculib.fft.fft(ary, out[, stream])
+.. py:function:: pyculib.fft.fft_inplace(ary[, stream])
+
+    :param ary: The input array. The inplace version stores the result here.
+    :param out: The output array for non-inplace versions.
+    :param stream: The CUDA stream in which all operations will take place.
+
+
+Inverse FFT
+------------
+
+.. py:function:: pyculib.fft.ifft(ary, out[, stream])
+.. py:function:: pyculib.fft.ifft_inplace(ary[, stream])
+
+    :param ary: The input array. The inplace version stores the result here.
+    :param out: The output array for non-inplace versions.
+    :param stream: The CUDA stream in which all operations will take place.
+
+FFTPlan
+--------
+
+.. autoclass:: pyculib.fft.FFTPlan
+    :members:
+
diff --git a/docs/source/curand.rst b/docs/source/curand.rst
new file mode 100644
index 0000000..b31224e
--- /dev/null
+++ b/docs/source/curand.rst
@@ -0,0 +1,43 @@
+cuRAND
+======
+
+Provides a `pseudo-random number generator` (PRNG) and a `quasi-random number generator` (QRNG).
+See `NVIDIA cuRAND <https://developer.nvidia.com/curand>`_.
+
+class PRNG
+-----------
+
+.. autoclass:: pyculib.rand.PRNG
+    :members:
+
+
+class QRNG
+------------
+
+.. autoclass:: pyculib.rand.QRNG
+    :members:
+
+
+Top Level PRNG Functions
+--------------------------
+
+Simple interface to the PRNG methods.
+
+.. note:: These functions automatically create a PRNG object.
+
+.. autofunction:: pyculib.rand.uniform
+
+.. autofunction:: pyculib.rand.normal
+
+.. autofunction:: pyculib.rand.lognormal
+
+.. autofunction:: pyculib.rand.poisson
+
+Top Level QRNG Functions
+--------------------------
+
+Simple interface to the QRNG methods.
+
+.. note:: These functions automatically create a QRNG object.
+
+.. autofunction:: pyculib.rand.quasi
diff --git a/docs/source/cusparse.rst b/docs/source/cusparse.rst
new file mode 100644
index 0000000..5c6c405
--- /dev/null
+++ b/docs/source/cusparse.rst
@@ -0,0 +1,763 @@
+cuSPARSE
+========
+
+Provides basic linear algebra operations for sparse matrices. See `NVIDIA
+cuSPARSE <https://developer.nvidia.com/cusparse>`_ for an in-depth description
+of the cuSPARSE library and its methods and data types. All functions are
+accessed through the :class:`pyculib.sparse.Sparse` class:
+
+.. autoclass:: pyculib.sparse.Sparse
+
+Similarly to the cuBLAS interface, no special naming convention is used for
+functions to operate on different datatypes - all datatypes are handled by each
+function, and dispatch of the corresponding library function is handled by
+Pyculib. However, it is often necessary to provide a *matrix descriptor* to
+functions, which provides some information about the format and properties of a
+matrix. A matrix descriptor can be obtained from the
+:py:meth:`pyculib.sparse.Sparse.matdescr` method:
+
+.. py:method:: pyculib.sparse.Sparse.matdescr(indexbase, diagtype, fillmode, matrixtype)
+
+    Creates a matrix descriptor that describes a matrix with the given
+    `indexbase`, `diagtype`, `fillmode`, and `matrixtype`. Note that not all of
+    these options are relevant to every matrix storage format.
+ + :param indexbase: Optional. 0 for 0-based indexing, or 1 for 1-based + indexing. If not specified, the default given to the + :py:class:`pyculib.sparse.Sparse` constructor is + used instead. + :param diagtype: Optional. Defaults to `'N'`. `'N'` signifies that the matrix + diagonal has non-unit elements. `'U'` signifies that the + matrix diagonal only contains unit elements. + :param fillmode: Optional. Defaults to `'L'`. `'L'` indicates that the lower + triangular part of the matrix is stored. `'U'` indicates + that the upper triangular part of the matrix is stored. + :param matrixtype: Optional. Defaults to `'G'`. `'S'` indicates that the + matrix is symmetric. `'H'` indicates that the matrix is + Hermitian. `'T'` indicates that the matrix is triangular. + `'G'` is used for a *general* matrix, which is not + symmetric, Hermitian, or triangular. + :return: A matrix descriptor. + +Many of the methods of the :class:`pyculib.sparse.Sparse` class accept +the individual data structures that make up a sparse representation of a matrix +(for example the values, the row pointers and the column indices for a CSR +format matrix). However, some methods (such as +:py:meth:`pyculib.sparse.Sparse.csrgemm_ez`), accept an instance of the +:class:`pyculib.sparse.CudaSparseMatrix` class: + +.. py:class:: pyculib.sparse.CudaSparseMatrix() + + Base class for a representation of a sparse matrix on a CUDA device. The + constructor takes no arguments. + + .. py:method:: from_host_matrix(matrix, stream) + + Initialise the matrix structure and values from an instance of a matrix on + the host. The host matrix must be of the corresponding host type, which is + documented for each subclass below. + + .. py:method:: copy_to_host(stream) + + Create an instance of the corresponding host matrix type and copy the + matrix structure and data into it from the device. See subclass + documentation for an indication of the corresponding matrix type. + +Subclasses of the sparse matrix type are: + +.. py:class:: pyculib.sparse.CudaBSRMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.bsr_matrix`. + +.. py:class:: pyculib.sparse.CudaCSRMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.csr_matrix`. + +.. py:class:: pyculib.sparse.CudaCSCMatrix() + + CUDA sparse matrix for which the corresponding type is a + :py:class:`scipy.sparse.csc_matrix`. + +There are also some convenience methods for constructing CUDA sparse matrices in +a similar manner to Scipy sparse matrices: + +.. automethod:: pyculib.sparse.bsr_matrix + +.. automethod:: pyculib.sparse.csr_matrix + +.. automethod:: pyculib.sparse.csc_matrix + +BLAS Level 1 +------------ + +.. py:method:: pyculib.sparse.Sparse.axpyi(alpha, xVal, xInd, y) + + Multiplies the sparse vector `x` by `alpha` and adds the result to the dense + vector `y`. + + :param alpha: scalar + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: dense vector + +.. py:method:: pyculib.sparse.Sparse.doti(xVal, xInd, y) + + Computes the dot product of the sparse vector `x` and dense vector `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: scalar + +.. py:method:: pyculib.sparse.Sparse.dotci(xVal, xInd, y) + + Computes the dot product of the complex conjugate of the sparse vector `x` + and the dense vector `y`. 
+ + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: scalar + +.. py:method:: pyculib.sparse.Sparse.gthr(y, xVal, xInd) + + Gathers the elements of `y` at the indices `xInd` into the array `xVal` + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.gthrz(y, xVal, xInd) + + Gathers the elements of `y` at the indices `xInd` into the array `xVal` and + zeroes out the gathered elements of `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.roti(xVal, xInd, y, c, s) + + Applies the Givens rotation matrix, `G`: + + .. math:: + + G = \left( \begin{array}{cc} + C & S \\ + -S & C + \end{array}\right) + + to the sparse vector `x` and dense vector + `y`. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :param c: cosine element of the rotation matrix + :param s: sine element of the rotation matrix + :return: None + +.. py:method:: pyculib.sparse.Sparse.sctr(xVal, xInd, y) + + Scatters the elements of the sparse vector `x` into the dense vector `y`. + Elements of `y` whose indices are not listed in `xInd` are unmodified. + + :param xVal: vector of non-zero values of `x` + :param xInd: vector of indices of non-zero values of `x` + :param y: dense vector + :return: None + + +BLAS Level 2 +------------ + +All level 2 routines follow the following naming convention for the following +arguments: + +* alpha, beta -- (scalar) Can be real or complex numbers. +* descr, descrA, descrB -- (descriptor) Matrix descriptor. An appropriate + descriptor may be obtained by calling + :py:meth:`pyculib.sparse.Sparse.matdescr`. `descr` only applies to the + only matrix argument. `descrA` and `descrB` apply to matrix `A` and matrix + `B`, respectively. +* dir -- (string) Can be `'C'` to indicate column-major block storage or `'R'` + to indicate row-major block storage. +* trans, transa, transb -- (string) + Select the operation `op` to apply to a matrix: + + - `'N'`: `op(X) = X`, the identity operation; + - `'T'`: `op(X) = X**T`, the transpose; + - `'C'`: `op(X) = X**H`, the conjugate transpose. + + `trans` only applies to the only matrix argument. + `transa` and `transb` apply to matrix `A` and matrix `B`, + respectively. + + +.. py:method:: pyculib.sparse.Sparse.bsrmv_matrix(dir, trans, alpha, descr, bsrmat, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + BSR-format matrix. + + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrmat: the matrix `A` + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.bsrmv(dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + BSR-format matrix. This function accepts the individual arrays that make up + the structure of a BSR matrix - if a + :class:`pyculib.sparse.CudaBSRMatrix` instance is to hand, it is + recommended to use the :py:meth:`bsrmv_matrix` method instead. 
+ + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param mb: Number of block rows of the matrix + :param nb: Number of block columns of the matrix + :param nnzb: Number of nonzero blocks of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrVal: vector of nonzero values of the matrix + :param bsrRowPtr: vector of block row pointers of the matrix + :param bsrColInd: vector of block column indices of the matrix + :param blockDim: block dimension of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.bsrxmv(dir, trans, sizeOfMask, mb, nb, nnzb, alpha, descr, bsrVal, bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, blockDim, x, beta, y) + + Matrix-vector multiplication similar to :py:meth:`bsrmv`, but including a + mask operation: `y(mask) = (alpha * op(A) * x + beta * y)(mask)`. The blocks + of y to be updated are specified in `bsrMaskPtr`. Blocks whose indices are + not specified in `bsrMaskPtr` are left unmodified. + + :param dir: block storage direction + :param trans: operation to apply to the matrix + :param sizeOfMask: number of updated blocks of rows of `y` + :param mb: Number of block rows of the matrix + :param nb: Number of block columns of the matrix + :param nnzb: Number of nonzero blocks of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param bsrVal: vector of nonzero values of the matrix + :param bsrMaskPtr: vector of indices of the block elements to be updated + :param bsrRowPtr: vector of block row pointers of the matrix + :param bsrEndPtr: vector of pointers to the end of every block row plus one + :param bsrColInd: vector of block column indices of the matrix + :param blockDim: block dimension of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrmv(trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y) + + Matrix-vector multiplication `y = alpha * op(A) * x + beta * y` with a + CSR-format matrix. + + :param trans: operation to apply to the matrix + :param m: Number of rows of the matrix + :param n: Number of columns of the matrix + :param nnz: Number of nonzeroes of the matrix + :param alpha: scalar + :param descr: matrix descriptor + :param csrVal: vector of nonzero values of the matrix + :param csrRowPtr: vector of row pointers of the matrix + :param csrColInd: vector of column indices of the matrix + :param x: dense vector + :param beta: scalar + :param y: dense vector + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrsv_analysis(trans, m, nnz, descr, csrVal, csrRowPtr, csrColInd) + + Performs the analysis phase of the solution of the sparse triangular linear + system `op(A) * y = alpha * x`. This needs to be executed only once for a + given matrix and operation type. + + :param trans: operation to apply to the matrix + :param m: number of rows of the matrix + :param nnz: number of nonzeroes of the matrix + :param descr: matrix descriptor + :param csrVal: vector of nonzero values of the matrix + :param csrRowPtr: vector of row pointers of the matrix + :param csrColInd: vector of column indices of the matrix + :return: the analysis result, which can be used as input to the solve phase + +.. 
py:method:: pyculib.sparse.Sparse.csrsv_solve(trans, m, alpha, descr, csrVal, csrRowPtr, csrColInd, info, x, y)
+
+    Performs the solve phase of the solution of the sparse triangular linear
+    system `op(A) * y = alpha * x`, using the analysis result `info` from
+    :py:meth:`csrsv_analysis`.
+
+    :param trans: operation to apply to the matrix
+    :param m: number of rows of the matrix
+    :param alpha: scalar
+    :param descr: matrix descriptor
+    :param csrVal: vector of nonzero values of the matrix
+    :param csrRowPtr: vector of row pointers of the matrix
+    :param csrColInd: vector of column indices of the matrix
+    :param info: the analysis result from :py:meth:`csrsv_analysis`
+    :param x: dense vector
+    :param y: dense vector into which the solve result is stored
+    :return: None
+
+
+BLAS Level 3
+------------
+
+.. py:method:: pyculib.sparse.Sparse.csrmm(transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc)
+
+    Matrix-matrix multiplication `C = alpha * op(A) * B + beta * C` where `A` is
+    a sparse matrix in CSR format and `B` and `C` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param m: number of rows of `A`
+    :param n: number of columns of `B` and `C`
+    :param k: number of columns of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param alpha: scalar
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :param B: dense matrix
+    :param ldb: leading dimension of `B`
+    :param beta: scalar
+    :param C: dense matrix
+    :param ldc: leading dimension of `C`
+    :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csrmm2(transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc)
+
+    Matrix-matrix multiplication `C = alpha * op(A) * op(B) + beta * C` where `A` is
+    a sparse matrix in CSR format and `B` and `C` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param transB: operation to apply to `B`
+    :param m: number of rows of `A`
+    :param n: number of columns of `B` and `C`
+    :param k: number of columns of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param alpha: scalar
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :param B: dense matrix
+    :param ldb: leading dimension of `B`
+    :param beta: scalar
+    :param C: dense matrix
+    :param ldc: leading dimension of `C`
+    :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csrsm_analysis(transA, m, nnz, descrA, csrValA, csrRowPtrA, csrColIndA)
+
+    Performs the analysis phase of the solution of a sparse triangular linear
+    system `op(A) * Y = alpha * X` with multiple right-hand sides, where `A` is a
+    sparse matrix in CSR format, and `X` and `Y` are dense matrices.
+
+    :param transA: operation to apply to `A`
+    :param m: number of rows of `A`
+    :param nnz: number of nonzeroes in `A`
+    :param descrA: matrix descriptor
+    :param csrValA: vector of nonzero values of `A`
+    :param csrRowPtrA: vector of row pointers of `A`
+    :param csrColIndA: vector of column indices of `A`
+    :return: the analysis result
+
+..
+.. py:method:: pyculib.sparse.Sparse.csrsm_solve(transA, m, n, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, info, X, ldx, Y, ldy)
+
+   Performs the solve phase of the solution of a sparse triangular linear
+   system `op(A) * Y = alpha * X` with multiple right-hand sides, where `A` is
+   a sparse matrix in CSR format and `X` and `Y` are dense matrices.
+
+   :param transA: operation to apply to `A`
+   :param m: number of rows of `A`
+   :param n: number of columns of `X` and `Y` (the number of right-hand sides)
+   :param alpha: scalar
+   :param descrA: matrix descriptor
+   :param csrValA: vector of nonzero values of `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param info: the analysis result from :py:meth:`csrsm_analysis`
+   :param X: dense matrix
+   :param ldx: leading dimension of `X`
+   :param Y: dense matrix into which the solve result is stored
+   :param ldy: leading dimension of `Y`
+   :return: None
+
+
+Extra Functions
+---------------
+
+.. py:method:: pyculib.sparse.Sparse.XcsrgeamNnz(m, n, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC)
+
+   Set up the sparsity pattern for the matrix operation `C = alpha * A + beta *
+   B` where `A`, `B`, and `C` are all sparse matrices in CSR format.
+
+   :param m: number of rows of all matrices
+   :param n: number of columns of all matrices
+   :param descrA: matrix descriptor for `A`
+   :param nnzA: number of nonzeroes in `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param descrB: matrix descriptor for `B`
+   :param nnzB: number of nonzeroes in `B`
+   :param csrRowPtrB: vector of row pointers of `B`
+   :param csrColIndB: vector of column indices of `B`
+   :param descrC: matrix descriptor for `C`
+   :param csrRowPtrC: vector of row pointers of `C`, written to by this method
+   :return: number of nonzeroes in `C`
+
+.. py:method:: pyculib.sparse.Sparse.csrgeam(m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC)
+
+   Performs the matrix operation `C = alpha * A + beta * B` where `A`, `B`,
+   and `C` are all sparse matrices in CSR format.
+
+   :param m: number of rows of all matrices
+   :param n: number of columns of all matrices
+   :param alpha: scalar
+   :param descrA: matrix descriptor for `A`
+   :param nnzA: number of nonzeroes in `A`
+   :param csrValA: vector of nonzero values of `A`
+   :param csrRowPtrA: vector of row pointers of `A`
+   :param csrColIndA: vector of column indices of `A`
+   :param beta: scalar
+   :param descrB: matrix descriptor for `B`
+   :param nnzB: number of nonzeroes in `B`
+   :param csrValB: vector of nonzero values of `B`
+   :param csrRowPtrB: vector of row pointers of `B`
+   :param csrColIndB: vector of column indices of `B`
+   :param descrC: matrix descriptor for `C`
+   :param csrValC: vector of nonzero values of `C`
+   :param csrRowPtrC: vector of row pointers of `C`
+   :param csrColIndC: vector of column indices of `C`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.XcsrgemmNnz(transA, transB, m, n, k, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC)
+
+   Set up the sparsity pattern for the matrix operation `C = op(A) * op(B)`
+   where `A`, `B`, and `C` are all sparse matrices in CSR format.
+ + :param transA: operation to apply to `A` + :param transB: operation to apply to `B` + :param m: number of rows of `A` and `C` + :param n: number of columns of `B` and `C` + :param k: number of columns/rows of `A`/`B` + :param descrA: matrix descriptor for `A` + :param nnzA: number of nonzeroes in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param descrB: matrix descriptor for `B` + :param nnzB: number of nonzeroes in `B` + :param csrRowPtrB: vector of row pointers of `B` + :param csrColIndB: vector of column indices of `B` + :param descrC: matrix descriptor for `C` + :param csrRowPtrC: vector of row pointers of `C`, written by this function + :return: number of nonzeroes in `C` + +.. py:method:: pyculib.sparse.Sparse.csrgemm(transA, transB, m, n, k, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC) + + Perform the matrix operation `C = op(A) * op(B)` where `A`, `B`, and `C` are + all sparse matrices in CSR format. + + :param transA: operation to apply to `A` + :param transB: operation to apply to `B` + :param m: number of rows of `A` and `C` + :param n: number of columns of `B` and `C` + :param k: number of columns/rows of `A`/`B` + :param descrA: matrix descriptor for `A` + :param nnzA: number of nonzeroes in `A` + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param descrB: matrix descriptor for `B` + :param nnzB: number of nonzeroes in `B` + :param csrValB: vector of nonzero values in `B` + :param csrRowPtrB: vector of row pointers of `B` + :param csrColIndB: vector of column indices of `B` + :param descrC: matrix descriptor for `C` + :param csrValC: vector of nonzero values in `C` + :param csrRowPtrC: vector of row pointers of `C` + :param csrColIndC: vector of column indices of `C` + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrgemm_ez(A, B, transA='N', transB='N', descrA=None, descrB=None, descrC=None) + + Performs the matrix operation `C = op(A) * op(B)` where `A`, `B` and `C` + are all sparse matrices in CSR format. This function accepts and returns + :py:class:`pyculib.sparse.CudaCSRMatrix` matrices, and makes + calls to :py:meth:`XcsrgemmNnz` and :py:meth:`csrgemm`. + + :param A: :py:class:`pyculib.sparse.CudaCSRMatrix` + :param B: :py:class:`pyculib.sparse.CudaCSRMatrix` + :param transA: optional, operation to apply to `A` + :param transB: optional, operation to apply to `B` + :param descrA: optional, matrix descriptor for `A` + :param descrB: optional, matrix descriptor for `B` + :param descrC: optional, matrix descriptor for `C` + :return: :py:class:`pyculib.sparse.CudaCSRMatrix` + + +Preconditioners +--------------- + +.. py:method:: pyculib.sparse.Sparse.csric0(trans, m, descr, csrValA, csrRowPtrA, csrColIndA, info) + + Computes incomplete Cholesky factorization of a sparse matrix in CSR format + with 0 fill-in and no pivoting: `op(A) = R**T * R`. This method must follow a + call to :py:meth:`csrsv_analysis`. The matrix `A` is overwritten with the + upper or lower triangular factors `R` or `R**T`. 
+ + :param trans: operation to apply to the matrix + :param m: number of rows and columns of the matrix + :param descr: matrix descriptor + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param info: analysis result + :return: None + +.. py:method:: pyculib.sparse.Sparse.csrilu0(trans, m, descr, csrValA, csrRowPtrA, csrColIndA, info) + + Computes incomplete-LU factorization of a sparse matrix in CSR format with 0 + fill-in and no pivoting: `op(A) = L * U`. This method must follow a call to + :py:meth:`csrsv_analysis`. The matrix `A` is overwritten with the lower and + upper triangular factors `L` and `U`. + + :param trans: operation to apply to the matrix + :param m: number of rows and columns of the matrix + :param descr: matrix descriptor + :param csrValA: vector of nonzero values in `A` + :param csrRowPtrA: vector of row pointers of `A` + :param csrColIndA: vector of column indices of `A` + :param info: analysis result + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsv(m, n, dl, d, du, B, ldb) + + Computes the solution of a tridiagonal linear system with multiple right-hand + sides: `A * Y = alpha * X`. + + :param m: the size of the linear system + :param n: the number of right-hand sides in the system + :param dl: dense vector storing the lower-diagonal elements + :param d: dense vector storing the diagonal elements + :param du: dense vector storing the upper-diagonal elements + :param B: dense matrix holding the right-hand sides of the system + :param ldb: the leading dimension of `B` + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsv_nopivot(m, n, dl, d, du, B, ldb) + + Similar to :py:meth:`gtsv`, but computes the solution without performing any + pivoting. + + :param m: the size of the linear system + :param n: the number of right-hand sides in the system + :param dl: dense vector storing the lower-diagonal elements + :param d: dense vector storing the diagonal elements + :param du: dense vector storing the upper-diagonal elements + :param B: dense matrix holding the right-hand sides of the system + :param ldb: the leading dimension of `B` + :return: None + +.. py:method:: pyculib.sparse.Sparse.gtsvStridedBatch(m, dl, d, du, x, batchCount, batchStride) + + Computes the solution of `i` tridiagonal linear systems: `A(i) * y(i) = alpha + * x(i)`. + + :param m: the size of the linear systems + :param dl: stacked dense vector storing the lower-diagonal elements of each + system + :param d: stacked dense vector storing the diagonal elements of each system + :param du: stacked dense vector storing the upper-diagonal elements of each + system + :param x: dense matrix holding the right-hand sides of the systems + :param batchCount: number of systems to solve + :param batchStride: number of elements separating the vectors of each system + :return: None + + +Format Conversion +----------------- + +.. py:method:: pyculib.sparse.Sparse.bsr2csr(dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, bsrColIndA, blockDim, descrC, csrValC, csrRowPtrC, csrColIndC) + + Convert the sparse matrix `A` in BSR format to CSR format, stored in `C`. 
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param mb: number of block rows of `A`
+   :param nb: number of block columns of `A`
+   :param descrA: matrix descriptor for `A`
+   :param bsrValA: vector of nonzero values of `A`
+   :param bsrRowPtrA: vector of block row pointers of `A`
+   :param bsrColIndA: vector of block column indices of `A`
+   :param blockDim: block dimension of `A`
+   :param descrC: matrix descriptor for `C`
+   :param csrValC: vector of nonzero values in `C`
+   :param csrRowPtrC: vector of row pointers of `C`
+   :param csrColIndC: vector of column indices of `C`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.Xcoo2csr(cooRowInd, nnz, m, csrRowPtr)
+
+   Converts an array containing uncompressed row indices corresponding to the
+   COO format into an array of compressed row pointers corresponding to the
+   CSR format.
+
+   :param cooRowInd: integer array of uncompressed row indices
+   :param nnz: number of nonzeroes
+   :param m: number of matrix rows
+   :param csrRowPtr: vector of row pointers to be written to
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csc2dense(m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda)
+
+   Convert the sparse matrix `A` in CSC format into a dense matrix.
+
+   :param m: number of rows of `A`
+   :param n: number of columns of `A`
+   :param descrA: matrix descriptor for `A`
+   :param cscValA: values in the CSC representation of `A`
+   :param cscRowIndA: row indices in the CSC representation of `A`
+   :param cscColPtrA: column pointers in the CSC representation of `A`
+   :param A: dense matrix representation of `A`, written by this function
+   :param lda: leading dimension of `A`
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.Xcsr2bsrNnz(dirA, m, n, descrA, csrRowPtrA, csrColIndA, blockDim, descrC, bsrRowPtrC)
+
+   Performs the analysis necessary for converting a matrix in CSR format into
+   BSR format.
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor for input matrix `A`
+   :param csrRowPtrA: row pointers of matrix
+   :param csrColIndA: column indices of matrix
+   :param blockDim: block dimension of output matrix `C`
+   :param descrC: matrix descriptor for output matrix `C`
+   :param bsrRowPtrC: block row pointers of output matrix `C`, written by this
+       function
+   :return: number of nonzero blocks of the output matrix `C`
+
+.. py:method:: pyculib.sparse.Sparse.csr2bsr(dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC)
+
+   Performs conversion of a matrix from CSR format into BSR format.
+
+   :param dirA: row ('R') or column ('C') orientation of block storage
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor for input matrix `A`
+   :param csrValA: nonzero values of matrix
+   :param csrRowPtrA: row pointers of matrix
+   :param csrColIndA: column indices of matrix
+   :param blockDim: block dimension of output matrix `C`
+   :param descrC: matrix descriptor for output matrix `C`
+   :param bsrValC: nonzero values of output matrix `C`
+   :param bsrRowPtrC: block row pointers of output matrix `C`
+   :param bsrColIndC: block column indices of output matrix `C`
+   :return: number of nonzeroes of matrix
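+
+Conversions that produce a new sparse layout follow a two-step pattern: the
+``Nnz`` routine sizes the output, and the conversion call then fills it. The
+following is a minimal sketch of a CSR-to-BSR conversion, assuming a
+``Sparse()`` instance, host NumPy arrays (assumed to be copied to the device
+automatically), and a ``matdescr`` descriptor helper:
+
+.. code-block:: python
+
+   import numpy as np
+   from pyculib import sparse
+
+   sp = sparse.Sparse()
+
+   # A 4x4 matrix with 5 nonzeroes, in CSR form (0-based indices).
+   m = n = 4
+   csrValA    = np.array([1., 2., 3., 4., 5.], dtype=np.float32)
+   csrRowPtrA = np.array([0, 1, 2, 4, 5], dtype=np.int32)
+   csrColIndA = np.array([0, 1, 0, 2, 3], dtype=np.int32)
+
+   descrA = sp.matdescr()   # assumed descriptor helper
+   descrC = sp.matdescr()
+   blockDim = 2
+   mb = (m + blockDim - 1) // blockDim   # number of block rows
+
+   # Step 1: fill the block row pointers of C and count nonzero blocks.
+   bsrRowPtrC = np.zeros(mb + 1, dtype=np.int32)
+   nnzb = sp.Xcsr2bsrNnz('R', m, n, descrA, csrRowPtrA, csrColIndA,
+                         blockDim, descrC, bsrRowPtrC)
+
+   # Step 2: allocate the BSR value/index arrays and convert.
+   bsrValC    = np.zeros(nnzb * blockDim * blockDim, dtype=csrValA.dtype)
+   bsrColIndC = np.zeros(nnzb, dtype=np.int32)
+   sp.csr2bsr('R', m, n, descrA, csrValA, csrRowPtrA, csrColIndA,
+              blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC)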
+.. py:method:: pyculib.sparse.Sparse.Xcsr2coo(csrRowPtr, nnz, m, cooRowInd)
+
+   Converts an array of compressed row pointers corresponding to the CSR
+   format into an array of uncompressed row indices corresponding to the COO
+   format.
+
+   :param csrRowPtr: vector of row pointers
+   :param nnz: number of nonzeroes
+   :param m: number of rows of matrix
+   :param cooRowInd: vector of uncompressed row indices written by this function
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csr2csc(m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues)
+
+   Converts a sparse matrix in CSR format into a sparse matrix in CSC format.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param nnz: number of nonzeroes of the matrix
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :param cscVal: values in the CSC representation
+   :param cscRowInd: row indices in the CSC representation
+   :param cscColPtr: column pointers in the CSC representation
+   :param copyValues: `'N'` for a numeric copy of the values, or `'S'` for a
+       symbolic (pattern-only) copy
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.csr2dense(m, n, descr, csrVal, csrRowPtr, csrColInd, A, lda)
+
+   Convert a sparse matrix in CSR format into dense format.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descr: matrix descriptor
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :param A: the dense representation, written to by this function
+   :param lda: leading dimension of the matrix
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.dense2csc(m, n, descrA, A, lda, nnzPerCol, cscVal, cscRowInd, cscColPtr)
+
+   Convert a dense matrix into a sparse matrix in CSC format. The `nnzPerCol`
+   parameter may be computed with a call to :py:meth:`nnz`.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor
+   :param A: the matrix in dense format
+   :param lda: leading dimension of the matrix
+   :param nnzPerCol: array containing the number of nonzero elements per column
+   :param cscVal: values in the CSC representation
+   :param cscRowInd: row indices in the CSC representation
+   :param cscColPtr: column pointers in the CSC representation
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.dense2csr(m, n, descrA, A, lda, nnzPerRow, csrVal, csrRowPtr, csrColInd)
+
+   Convert a dense matrix into a sparse matrix in CSR format. The `nnzPerRow`
+   parameter may be computed with a call to :py:meth:`nnz`.
+
+   :param m: number of rows of matrix
+   :param n: number of columns of matrix
+   :param descrA: matrix descriptor
+   :param A: the matrix in dense format
+   :param lda: leading dimension of the matrix
+   :param nnzPerRow: array containing the number of nonzero elements per row
+   :param csrVal: values in the CSR representation
+   :param csrRowPtr: row pointers in the CSR representation
+   :param csrColInd: column indices in the CSR representation
+   :return: None
+
+.. py:method:: pyculib.sparse.Sparse.nnz(dirA, m, n, descrA, A, lda, nnzPerRowCol)
+
+   Computes the number of nonzero elements per row or column of a dense
+   matrix, and the total number of nonzero elements in the matrix.
+
+   :param dirA: `'R'` for the number of nonzeroes per row, or `'C'` for per
+       column.
+ :param m: number of rows of matrix + :param n: number of columns of matrix + :param descrA: matrix descriptor + :param A: the matrix + :param lda: leading dimension of the matrix + :param nnzPerRowCol: array to contain the number of nonzeroes per row or + column + :return: total number of nonzeroes in the matrix diff --git a/docs/source/env-variables.rst b/docs/source/env-variables.rst new file mode 100644 index 0000000..43fdb18 --- /dev/null +++ b/docs/source/env-variables.rst @@ -0,0 +1,7 @@ +===================== +Environment variables +===================== + +``PYCULIB_WARNINGS`` + +If set to anything but 0 (zero), Pyculib may issue performance warnings, such as when input arguments need to be copied to adjust their data layout, or types, to match particular backend requirements. diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..5678440 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,51 @@ +========= +Pyculib +========= + +:emphasis:`High Performance Computing` + +Pyculib is a package that provides access to several numerical libraries that are optimized for performance on NVidia GPUs. + +Pyculib was originally part of Accelerate, developed by Continuum Analytics Inc. + +The current version, 0.1.0, was released on TODO. + +Features +======== + +* Bindings to the following :doc:`cuda-libs`: + * :doc:`cublas` + * :doc:`cufft` + * :doc:`cusparse` + * :doc:`curand` + * :doc:`sorting` algorithms from the CUB and Modern GPU libraries + +Installation +============ + +This section contains information related to: + +.. toctree:: + :maxdepth: 1 + + install + + +User guide +========== + +This section contains information related to: + +.. toctree:: + :maxdepth: 1 + + cuda-libs + env-variables + +Release notes +============= + +.. toctree:: + :maxdepth: 1 + + release-notes diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 0000000..b5c5fa2 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,36 @@ +Requirements +============ + +* 64-bit operating system--Windows, macOS or Linux +* Supported Python and Numpy combinations: + * Python XXX with Numpy XXX TODO: FILL IN +* Numba 0.33 + +CUDA feature requirements +------------------------- + +* NVidia driver version XXX or later TODO: FILL IN +* CUDA toolkit 7.5 +* At least one CUDA GPU with compute capability 2.0 or above + + +Installing Pyculib +===================== + +If you already have the `Anaconda free +Python distribution `, take the following steps to install Pyculib: + +#. Run the command ``conda update conda``. +#. Run the command ``conda install pyculib``. + +If you do not have Anaconda installed, see `Downloads `_. + +NOTE: Pyculib can also be installed into your own non-Anaconda Python environment via pip or setuptools. + +Updating Pyculib +=================== + +To update Pyculib, take the following steps: + +#. Run the command ``conda update conda``. +#. Run the command ``conda update pyculib``. diff --git a/docs/source/release-notes.rst b/docs/source/release-notes.rst new file mode 100644 index 0000000..93b55c5 --- /dev/null +++ b/docs/source/release-notes.rst @@ -0,0 +1,27 @@ +============= +Release notes +============= + +Version 1.0.0 +============= + +NumbaPro and Accelerate have been deprecated, and code generation features have +been moved into open-source Numba. The CUDA library functions have been moved +into Pyculib. There will be no further updates to NumbaPro or Accelerate. 
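+
+For example, code that imported the Accelerate CUDA BLAS bindings now imports
+the Pyculib equivalent (the full package mapping is given in the table below):
+
+.. code-block:: python
+
+   # Accelerate 2.x
+   from accelerate.cuda import blas
+
+   # Pyculib
+   from pyculib import blas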
+ +CUDA libraries +-------------- + +Pyculib CUDA library functionality is equivalent to that in Accelerate 2.+, +with the following packages renamed: + +=========================== =========================== +Accelerate package Pyculib package +=========================== =========================== +``accelerate.cuda.blas`` ``pyculib.blas`` +``accelerate.cuda.fft`` ``pyculib.fft`` +``accelerate.cuda.rand`` ``pyculib.rand`` +``accelerate.cuda.sparse`` ``pyculib.sparse`` +``accelerate.cuda.sorting`` ``pyculib.sorting`` +=========================== =========================== + diff --git a/docs/source/sorting.rst b/docs/source/sorting.rst new file mode 100644 index 0000000..60fb84e --- /dev/null +++ b/docs/source/sorting.rst @@ -0,0 +1,25 @@ +============ +CUDA Sorting +============ + +Pyculib provides routines for sorting arrays on CUDA GPUs. + +Sorting Large Arrays +==================== + +The :py:class:`pyculib.sorting.RadixSort` class is recommended for +sorting large (approx. more than 1 million items) arrays of numeric types. + +.. autoclass:: pyculib.sorting.RadixSort + :members: + +Sorting Many Small Arrays +========================= + +Using :py:class:`pyculib.sorting.RadixSort` on small (approx. less than +1 million items) arrays has significant overhead due to multiple kernel +launches. + +A better alternative is to use :py:func:`pyculib.sorting.segmented_sort`-which launches a single kernel for sorting a batch of many small arrays. + +.. autofunction:: pyculib.sorting.segmented_sort diff --git a/pyculib/__init__.py b/pyculib/__init__.py new file mode 100644 index 0000000..babdaba --- /dev/null +++ b/pyculib/__init__.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import +import numba +import re +import unittest +import sys +import platform +from pyculib import config +from pyculib import warnings + +NUMBA_VERSION_REQ = (0, 33, 0) + +def check_numba_version(): + m = re.match(r"(\d+)\.(\d+)\.(\d+).*", numba.__version__) + if m is None or tuple(map(int, m.groups())) < NUMBA_VERSION_REQ: + import warnings + warnings.showwarning( + "Numba version too old; expecting %d.%d.%d" % NUMBA_VERSION_REQ, + ImportWarning, __name__, 1) + +check_numba_version() + +def load_tests(loader, tests, pattern): + from .tests import test_cases + + suite = unittest.TestSuite() + for test_class in test_cases: + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + return suite + +def cuda_compatible(): + if sys.platform.startswith('darwin'): + ver = platform.mac_ver()[0] + # version string can contain two or three components + major, minor = ver.split('.', 1) + if '.' in minor: + minor, micro = minor.split('.', 1) + if (int(major), int(minor)) < (10, 9): + return False + + is_64bits = sys.maxsize > 2**32 + if not is_64bits: + return False + + return True + +if cuda_compatible(): + from numba import cuda + from . 
import blas, sparse, fft, rand, sorting + +def test(): + success = True + if cuda_compatible() and cuda.is_available(): + print('CUDA Library tests'.center(80, '~'), '\n') + print('cuBLAS'.center(80, '-')) + if not blas.test().wasSuccessful(): + success = False + print('cuSPARSE'.center(80, '-')) + if not sparse.test().wasSuccessful(): + success = False + print('cuFFT'.center(80, '-')) + if not fft.test().wasSuccessful(): + success = False + print('cuRAND'.center(80, '-')) + if not rand.test().wasSuccessful(): + success = False + print('Sorting'.center(80, '-')) + if not sorting.test().wasSuccessful(): + success = False + else: + print('CUDA unavailable - skipped CUDA tests') + + return success + +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/pyculib/_version.py b/pyculib/_version.py new file mode 100644 index 0000000..8a81283 --- /dev/null +++ b/pyculib/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "" + cfg.parentdir_prefix = "pyculib" + cfg.versionfile_source = "pyculib/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/pyculib/blas/__init__.py b/pyculib/blas/__init__.py new file mode 100644 index 0000000..6d9a747 --- /dev/null +++ b/pyculib/blas/__init__.py @@ -0,0 +1,149 @@ +from __future__ import absolute_import, print_function +from .api import Blas, validate_argument_dtype +from pyculib.nputil import promote, alias, astype, colmajor +import numpy as np +from numbers import Number + + +_blas = None + +def _normalize_op(op): + + if op == 'n': return 'N' + elif op == 't': return 'T' + elif op == 'c': return 'C' + return op + + +def dot(x, y, stream=None): + """Compute and return the vector dot product of x and y.""" + global _blas + + validate_argument_dtype(x, 'x') + validate_argument_dtype(y, 'y') + if not _blas: _blas = Blas() + _blas.stream = stream + dtype = promote(x.dtype, y.dtype) + # FIXME: the col-major constraint actually only applies to host arrays. 
+ # If x and y are already device arrays they can be strided. + return _blas.dot(colmajor(x, dtype, 'x'), colmajor(y, dtype, 'y')) + +def axpy(alpha, x, y, stream=None): + """y <- alpha*x + y """ + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(x, 'x') + validate_argument_dtype(y, 'y') + if not _blas: _blas = Blas() + _blas.stream = stream + dtype = promote(promote(type(alpha), x.dtype), y.dtype) + yf = colmajor(y, dtype, 'y') + _blas.axpy(dtype.type(alpha), x.astype(dtype), yf) + if y.dtype == yf.dtype and not alias(y, yf): + y[:] = yf + return y + else: + return yf + +def gemv(trans, alpha, A, x, beta=0, y=None, stream=None): + """Generalized matrix-vector multiplication: + + y <- alpha*trans(A)*x + beta*y + + 'beta' and 'y' are optional on input. Return 'y'.""" + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(A, 'A') + validate_argument_dtype(x, 'x') + if not isinstance(beta, Number): raise ValueError('beta is not a numeric type') + if A.ndim != 2: raise ValueError('A is not a two-dimensional array') + if x.ndim != 1: raise ValueError('x is not a one-dimensional array') + if not _blas: _blas = Blas() + _blas.stream = stream + m, n = A.shape + trans = _normalize_op(trans) + if trans not in ('N', 'T', 'C'): raise ValueError('trans has invalid value') + dtype = promote(promote(type(alpha), A.dtype), + promote(x.dtype, type(beta))) + if y is None: + y = np.empty(trans == 'N' and n or m, dtype=dtype) + yf = y + else: + validate_argument_dtype(y, 'y') + if y.ndim != 1: raise ValueError('y is not a one-dimensional array') + dtype = promote(dtype, y.dtype) + yf = colmajor(y, dtype, 'y') + if trans == 'N': + if A.shape[1] != x.shape[0]: + raise ValueError('arrays A and x have incompatible shapes') + if A.shape[0] != y.shape[0]: + raise ValueError('arrays A and y have incompatible shapes') + else: + if A.shape[0] != x.shape[0]: + raise ValueError('arrays A and x have incompatible shapes') + if A.shape[1] != y.shape[0]: + raise ValueError('arrays A and y have incompatible shapes') + _blas.gemv(trans, m, n, dtype.type(alpha), colmajor(A, dtype, 'A'), + x.astype(dtype), dtype.type(beta), yf) + if y.dtype == yf.dtype and not alias(y, yf): + y[:] = yf + return y + else: + return yf + +def gemm(transa, transb, alpha, A, B, beta=0, C=None, stream=None): + """Generalized matrix-matrix multiplication: + + C <- alpha*transa(A)*transb(B) + beta*C + + 'beta' and 'C' are optional on input. 
Return 'C'.""" + + global _blas + + if not isinstance(alpha, Number): raise ValueError('alpha is not a numeric type') + validate_argument_dtype(A, 'A') + validate_argument_dtype(B, 'B') + if not isinstance(beta, Number): raise ValueError('beta is not a numeric type') + if A.ndim != 2: raise ValueError('A is not a two-dimensional array') + if B.ndim != 2: raise ValueError('B is not a two-dimensional array') + if not _blas: _blas = Blas() + _blas.stream = stream + transa = _normalize_op(transa) + if transa not in ('N', 'T', 'C'): raise ValueError('transa has invalid value') + transb = _normalize_op(transb) + if transb not in ('N', 'T', 'C'): raise ValueError('transb has invalid value') + dtype = promote(promote(type(alpha), A.dtype), + promote(B.dtype, type(beta))) + M = transa == 'N' and A.shape[0] or A.shape[1] + N = transb == 'N' and B.shape[1] or B.shape[0] + K = transa == 'N' and A.shape[1] or A.shape[0] + if C is None: + C = np.empty(shape=(M, N), order='F', dtype=dtype) + Cf = C + else: + validate_argument_dtype(C, 'C') + if C.ndim != 2: raise ValueError('C is not a two-dimensional array') + if C.shape[0] != M: + raise ValueError('arrays A and C have incompatible shapes') + if C.shape[1] != N: + raise ValueError('arrays B and C have incompatible shapes') + dtype = promote(dtype, C.dtype) + Cf = colmajor(C, dtype, 'C') + + if transb == 'N': + if B.shape[0] != K: raise ValueError('arrays A and B have incompatible shapes') + else: + if B.shape[1] != K: raise ValueError('arrays A and B have incompatible shapes') + + _blas.gemm(transa, transb, M, N, K, dtype.type(alpha), + colmajor(A, dtype, 'A'), colmajor(B, dtype, 'B'), + dtype.type(beta), Cf) + if C.dtype == Cf.dtype and not alias(C, Cf): + C[:] = Cf + return C + else: + return Cf diff --git a/pyculib/blas/api.py b/pyculib/blas/api.py new file mode 100644 index 0000000..5b4af52 --- /dev/null +++ b/pyculib/blas/api.py @@ -0,0 +1,501 @@ +from __future__ import print_function, absolute_import, division +from contextlib import contextmanager +import numpy as np +from .binding import cuBlas +from numba import cuda + +def validate_argument_dtype(var, name): + + # Type map + if var.dtype.name not in ('float32', 'float64', 'complex64', 'complex128'): + raise TypeError('%s has unsupported type %s'%(name, var.dtype.name)) + +def _dtype_vtable(table): + return dict((np.dtype(k), v) for k, v in table.items()) + +def _sel_complex(real, imag): + return {float: real, + complex: imag, + np.float32: real, + np.float64: real, + np.complex64: imag, + np.complex128: imag,} + +def _auto_l2_functions(fname, tnames, argfmt, extras): + writebacks = set() + readonlys = set() + arglist = [] + extras = [s.lstrip().rstrip() for s in extras.split(',')] + dtypemap = { + np.dtype(np.float32): 'S', + np.dtype(np.float64): 'D', + np.dtype(np.complex64): 'C', + np.dtype(np.complex128): 'Z', + } + for i, a in enumerate(argfmt.split(',')): + a = a.lstrip().rstrip() + if ':' in a: + name, mode = a.split(':') + assert mode in 'wr', "invalid mode" + if mode == 'w': + writebacks.add(name) + else: + readonlys.add(name) + else: + name = a + arglist.append(name) + + def prepare_args(args, kws): + for i, a in enumerate(args): + name = arglist[i] + assert name not in kws, "missing argument %s" % name + kws[name] = a + for a in extras: + if a.startswith('ld') and len(a) == 3: + kws[a] = kws[a[-1].upper()].shape[0] + elif a.startswith('inc') and len(a) == 4: + ary = kws[a[-1]] + kws[a] = ary.strides[0] // ary.dtype.itemsize + else: + assert False, 'unreachable' + + devargs = 
list(writebacks | readonlys) + + def autodevice(kws, stream): + newkws = kws.copy() + cleanups = [] + for a in readonlys: + newkws[a], _ = cuda._auto_device(kws[a], stream=stream) + for a in writebacks: + dmem, conv = cuda._auto_device(kws[a], stream=stream) + newkws[a] = dmem + if conv: + cleanups.append((dmem, kws[a])) + return newkws, cleanups + + def _dispatch(self, *args, **kws): + prepare_args(args, kws) + dtype = kws[devargs[0]].dtype + for i, darg in enumerate(devargs[1:]): + got = kws[darg].dtype + if got != dtype: + msg = "%dth array dtype mismatch: got %s but expect %s" + raise TypeError(msg % (i + 1, got, dtype)) + typecode = dtypemap[dtype] + assert typecode in tnames, "unknown typecode" + fn = getattr(self._cublas, '%s%s' % (typecode, fname)) + kws, cleanups = autodevice(kws, self.stream) + res = fn(**kws) + for dmem, ary in cleanups: + dmem.copy_to_host(ary, stream=self.stream) + return res + + # changes how user see this function through help() + _dispatch.__name__ = fname + _dispatch.__doc__ = "%s(%s)" % (fname, argfmt) + return _dispatch + +class Blas(object): + '''All BLAS subprograms are available under the Blas object. + + :param stream: Optional. A CUDA Stream. + ''' + @cuda.require_context + def __init__(self, stream=0): + self._cublas = cuBlas() + if stream: + self._cublas.stream = stream + + @property + def stream(self): + return self._cublas.stream + + @stream.setter + def stream(self, stream): + self._cublas.stream = stream + + @contextmanager + def _auto(self, *arys): + ctx = (cuda._auto_device(ary, stream=self.stream) for ary in arys) + darys, convs = zip(*ctx) + if len(darys) == 1: + yield darys[0] + else: + yield darys + for dary, conv, ary in zip(darys, convs, arys): + if conv: + dary.copy_to_host(ary, stream=self.stream) + + @contextmanager + def _auto_read(self, *arys): + ctx = (cuda._auto_device(ary, stream=self.stream) for ary in arys) + darys, convs = zip(*ctx) + if len(darys) == 1: + yield darys[0] + else: + yield darys + + def _dispatch(self, vtable, *keys): + rsvl = vtable + for k in keys: + if not isinstance(rsvl, dict): + break + + try: + rsvl = rsvl[k] + except KeyError: + raise TypeError(k) + + return getattr(self._cublas, rsvl) + + def nrm2(self, x): + "Same as np.linalg.norm" + _sentry_same_dtype(x) + _sentry_ndim(1, x) + fn = self._dispatch(self.nrm2.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, *_norm_stride(x)) + + nrm2.vtable = _dtype_vtable({np.float32: 'Snrm2', + np.float64: 'Dnrm2', + np.complex64: 'Scnrm2', + np.complex128: 'Dznrm2'}) + + def dot(self, x, y): + """Compute and return the vector dot product of x and y.""" + _sentry_same_dtype(x, y) + _sentry_ndim(1, x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.dot.vtable, x.dtype) + with self._auto_read(x, y) as (dx, dy): + result = fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0]) + result = x.dtype.type(result) + return result + + dot.vtable = _dtype_vtable({np.float32: 'Sdot', + np.float64: 'Ddot', + np.complex64: 'Cdotu', + np.complex128: 'Zdotu',}) + + def dotc(self, x, y): + "Same as np.vdot" + _sentry_same_dtype(x, y) + _sentry_ndim(1, x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.dotc.vtable, x.dtype) + with self._auto_read(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0]) + + dotc.vtable = _dtype_vtable({np.complex64: 'Cdotc', + np.complex128: 'Zdotc',}) + + def scal(self, alpha, x): + "Same as x = alpha * x" + _sentry_ndim(1, x) + fn = self._dispatch(self.scal.vtable, x.dtype, 
type(alpha)) + with self._auto(x) as dx: + return fn(x.size, alpha, dx, *_norm_stride(x)) + + scal.vtable = _dtype_vtable({np.float32: 'Sscal', + np.float64: 'Dscal', + np.complex64: _sel_complex(imag='Cscal', + real='Csscal'), + np.complex128: _sel_complex(imag='Zscal', + real='Zdscal')}) + def axpy(self, alpha, x, y): + "Same as y = alpha * x + y" + _sentry_ndim(1, x, y) + _sentry_same_dtype(x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.axpy.vtable, x.dtype) + with self._auto_read(x) as dx: + with self._auto(y) as dy: + return fn(x.size, alpha, dx, _norm_stride(x)[0], dy, + _norm_stride(y)[0]) + + axpy.vtable = _dtype_vtable({np.float32: 'Saxpy', + np.float64: 'Daxpy', + np.complex64: 'Caxpy', + np.complex128: 'Zaxpy'}) + + def amax(self, x): + "Same as np.argmax(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.amax.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) - 1 + + amax.vtable = _dtype_vtable({np.float32: 'Isamax', + np.float64: 'Idamax', + np.complex64: 'Icamax', + np.complex128: 'Izamax'}) + + def amin(self, x): + "Same as np.argmin(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.amin.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) - 1 + + amin.vtable = _dtype_vtable({np.float32: 'Isamin', + np.float64: 'Idamin', + np.complex64: 'Icamin', + np.complex128: 'Izamin'}) + + def asum(self, x): + "Same as np.sum(x)" + _sentry_ndim(1, x) + fn = self._dispatch(self.asum.vtable, x.dtype) + with self._auto_read(x) as dx: + return fn(x.size, dx, _norm_stride(x)[0]) + + asum.vtable = _dtype_vtable({np.float32: 'Sasum', + np.float64: 'Dasum', + np.complex64: 'Scasum', + np.complex128: 'Dzasum'}) + + def rot(self, x, y, c, s): + "Same as x, y = c * x + s * y, -s * x + c * y" + _sentry_ndim(1, x, y) + fn = self._dispatch(self.rot.vtable, x.dtype, type(s)) + with self._auto(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(x)[0], + c, s) + + rot.vtable = _dtype_vtable({np.float32: 'Srot', + np.float64: 'Drot', + np.complex64: _sel_complex(imag='Crot', + real='Csrot'), + np.complex128: _sel_complex(imag='Zrot', + real='Zdrot')}) + + def rotg(self, a, b): + '''Compute the given rotation matrix given a column vector (a, b). + Returns r, z, c, s. + + r: r = a ** 2 + b ** 2. + + z: Use to recover c and s. + + if abs(z) < 1: + c, s = 1 - z ** 2, z + elif abs(z) == 1: + c, s = 0, 1 + else: + c, s = 1 / z, 1 - z ** 2 + + c: Cosine element of the rotation matrix. + + s: Sine element of the rotation matrix. + ''' + a, b = np.asarray(a), np.asarray(b) + _sentry_same_dtype(a, b) + fn = self._dispatch(self.rotg.vtable, a.dtype) + return fn(np.asscalar(a), np.asscalar(b)) + + rotg.vtable = _dtype_vtable({np.float32: 'Srotg', + np.float64: 'Drotg', + np.complex64: 'Crotg', + np.complex128: 'Zrotg'}) + + def rotm(self, x, y, param): + '''Applies the modified Givens transformation. + + x, y = h11 * x + h12 * y, h21 * x + h22 * y + + param --- [flag, h11, h21, h12, h22] + + Refer to cuBLAS documentation for detail. + ''' + _sentry_ndim(1, x, y) + _sentry_same_dtype(x, y) + _sentry_same_shape(x, y) + fn = self._dispatch(self.rotm.vtable, x.dtype) + with self._auto(x, y) as (dx, dy): + return fn(x.size, dx, _norm_stride(x)[0], dy, _norm_stride(y)[0], + param) + + rotm.vtable = _dtype_vtable({np.float32: 'Srotm', + np.float64: 'Drotm'}) + + def rotmg(self, d1, d2, x1, y1): + '''Constructs the modified Givens transformation. + + Returns param that is usable in rotm. 
+ + Refer to cuBLAS documentation for detail. + ''' + d1, d2, x1, y1 = map(np.asarray, [d1, d2, x1, y1]) + _sentry_same_dtype(d1, d2, x1, y1) + fn = self._dispatch(self.rotmg.vtable, x1.dtype) + return fn(*map(np.asscalar, [d1, d2, x1, y1])) + + rotmg.vtable = _dtype_vtable({np.float32: 'Srotmg', + np.float64: 'Drotmg'}) + + # Level 2 + + gbmv = _auto_l2_functions('gbmv', 'SDCZ', + 'trans, m, n, kl, ku, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + gemv = _auto_l2_functions('gemv', 'SDCZ', + 'trans, m, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + trmv = _auto_l2_functions('trmv', 'SDCZ', + 'uplo, trans, diag, n, A:r, x:w', + 'lda, incx') + + tbmv = _auto_l2_functions('tbmv', 'SDCZ', + 'uplo, trans, diag, n, k, A:r, x:w', + 'lda, incx') + + tpmv = _auto_l2_functions('tpmv', 'SDCZ', + 'uplo, trans, diag, n, AP:r, x:w', + 'incx') + + trsv = _auto_l2_functions('trsv', 'SDCZ', + 'uplo, trans, diag, n, A:r, x:w', + 'lda, incx') + + tpsv = _auto_l2_functions('tpsv', 'SDCZ', + 'uplo, trans, diag, n, AP:r, x:w', + 'incx') + + tbsv = _auto_l2_functions('tbsv', 'SDCZ', + 'uplo, trans, diag, n, k, A:r, x:w', + 'lda, incx') + + symv = _auto_l2_functions('symv', 'SDCZ', + 'uplo, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + hemv = _auto_l2_functions('hemv', 'CZ', + 'uplo, n, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + sbmv = _auto_l2_functions('sbmv', 'SDCZ', + 'uplo, n, k, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + hbmv = _auto_l2_functions('hbmv', 'CZ', + 'uplo, n, k, alpha, A:r, x:r, beta, y:w', + 'lda, incx, incy') + + spmv = _auto_l2_functions('spmv', 'SD', + 'uplo, n, alpha, AP:r, x:r, beta, y:w', + 'incx, incy') + + hpmv = _auto_l2_functions('hpmv', 'CZ', + 'uplo, n, alpha, AP:r, x:r, beta, y:w', + 'incx, incy') + + ger = _auto_l2_functions('ger', 'SD', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + geru = _auto_l2_functions('geru', 'CZ', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + gerc = _auto_l2_functions('gerc', 'CZ', + 'm, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + syr = _auto_l2_functions('syr', 'SDCZ', 'uplo, n, alpha, x:r, A:w', + 'incx, lda') + + her = _auto_l2_functions('her', 'CZ', 'uplo, n, alpha, x:r, A:w', + 'incx, lda') + + spr = _auto_l2_functions('spr', 'SD', 'uplo, n, alpha, x:r, AP:w', + 'incx') + + hpr = _auto_l2_functions('hpr', 'CZ', 'uplo, n, alpha, x:r, AP:w', + 'incx') + + syr2 = _auto_l2_functions('syr2', 'SDCZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + her2 = _auto_l2_functions('her2', 'CZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy, lda') + + spr2 = _auto_l2_functions('spr2', 'SDCZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy') + + hpr2 = _auto_l2_functions('hpr2', 'CZ', + 'uplo, n, alpha, x:r, y:r, A:w', + 'incx, incy') + + # Level 3 + + gemm = _auto_l2_functions('gemm', 'SDCZ', + 'transa, transb, m, n, k, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + syrk = _auto_l2_functions('syrk', 'SDCZ', + 'uplo, trans, n, k, alpha, A:r, beta, C:w', + 'lda, ldc') + + herk = _auto_l2_functions('herk', 'CZ', + 'uplo, trans, n, k, alpha, A:r, beta, C:w', + 'lda, ldc') + + symm = _auto_l2_functions('symm', 'SDCZ', + 'side, uplo, m, n, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + hemm = _auto_l2_functions('hemm', 'CZ', + 'side, uplo, m, n, alpha, A:r, B:r, beta, C:w', + 'lda, ldb, ldc') + + trsm = _auto_l2_functions('trsm', 'SDCZ', + 'side, uplo, trans, diag, m, n, alpha, A:r, B:w', + 'lda, ldb') + + trmm = _auto_l2_functions('trmm', 'SDCZ', + 'side, uplo, 
trans, diag, m, n, alpha, A:r, B:r, C:w', + 'lda, ldb, ldc') + + dgmm = _auto_l2_functions('dgmm', 'SDCZ', + 'side, m, n, A:r, x:r, C:w', + 'lda, ldc, incx') + + geam = _auto_l2_functions('geam', 'SDCZ', + 'transa, transb, m, n, alpha, A:r, beta, B:r, C:w', + 'lda, ldb, ldc') + +#---------------- +# utils +#---------------- + + + +def _sentry_same_shape(*arys): + first = arys[0] + for ary in arys: + if ary.shape != first.shape: + raise ValueError("Expecting all arrays to have the same shape.") + +def _sentry_same_dtype(*arys): + first = arys[0] + for ary in arys: + if ary.dtype != first.dtype: + raise TypeError("All arrays must have the same dtype.") + +def _sentry_ndim(ndim, *arys): + for ary in arys: + if ary.ndim != ndim: + raise ValueError("Expecting %d dimension array." % ndim) + +def _norm_stride(ary): + retval = [] + for stride in ary.strides: + if stride % ary.dtype.itemsize != 0: + raise ValueError("Misalignment.") + retval.append(stride // ary.dtype.itemsize) + return retval diff --git a/pyculib/blas/binding.py b/pyculib/blas/binding.py new file mode 100644 index 0000000..adc6d90 --- /dev/null +++ b/pyculib/blas/binding.py @@ -0,0 +1,1476 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import c_float, c_double, byref, c_int, c_void_p, POINTER + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer, host_pointer +from pyculib.utils import (Lib, ctype_function, OwnerMixin, + c_double_complex, c_complex, memalign) + +INV_STATUS = dict( + CUBLAS_STATUS_SUCCESS=0, + CUBLAS_STATUS_NOT_INITIALIZED=1, + CUBLAS_STATUS_ALLOC_FAILED=3, + CUBLAS_STATUS_INVALID_VALUE=7, + CUBLAS_STATUS_ARCH_MISMATCH=8, + CUBLAS_STATUS_MAPPING_ERROR=11, + CUBLAS_STATUS_EXECUTION_FAILED=13, + CUBLAS_STATUS_INTERNAL_ERROR=14 +) + +STATUS = dict((v, k) for k, v in INV_STATUS.items()) + +cublasStatus_t = c_int + +CUBLAS_FILL_MODE_LOWER = 0 +CUBLAS_FILL_MODE_UPPER = 1 + +CUBLAS_FILL_MODE_MAP = { + 'L': CUBLAS_FILL_MODE_LOWER, + 'U': CUBLAS_FILL_MODE_UPPER, +} + +cublasFillMode_t = c_int + +CUBLAS_DIAG_NON_UNIT = 0 +CUBLAS_DIAG_UNIT = 1 + +cublasDiagType_t = c_int + +CUBLAS_DIAG_MAP = { + True: CUBLAS_DIAG_UNIT, + False: CUBLAS_DIAG_NON_UNIT, +} + +CUBLAS_SIDE_LEFT = 0 +CUBLAS_SIDE_RIGHT = 1 + +CUBLAS_SIDE_MAP = { + 'L': CUBLAS_SIDE_LEFT, + 'R': CUBLAS_SIDE_RIGHT, +} + +cublasSideMode_t = c_int + +CUBLAS_OP_N = 0 +CUBLAS_OP_T = 1 +CUBLAS_OP_C = 2 + +cublasOperation_t = c_int + +CUBLAS_POINTER_MODE_HOST = 0 +CUBLAS_POINTER_MODE_DEVICE = 1 + +cublasPointerMode_t = c_int + +CUBLAS_ATOMICS_NOT_ALLOWED = 0 +CUBLAS_ATOMICS_ALLOWED = 1 + +cublasAtomicsMode_t = c_int + +cublasHandle_t = c_void_p # opaque handle + +CUBLAS_OP_MAP = {'N': CUBLAS_OP_N, + 'T': CUBLAS_OP_T, + 'C': CUBLAS_OP_C, + 'H': CUBLAS_OP_C, } + + +class CuBLASError(Exception): + def __init__(self, code): + super(CuBLASError, self).__init__(STATUS[code]) + + +class libcublas(Lib): + lib = 'cublas' + ErrorType = CuBLASError + + cublasCreate_v2 = ctype_function(cublasStatus_t, + POINTER(cublasHandle_t)) # handle + + cublasDestroy_v2 = ctype_function(cublasStatus_t, + cublasHandle_t) # handle + + cublasGetVersion_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(c_int)) # version + + cublasSetStream_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cu_stream) # streamId + + cublasGetStream_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(cu_stream)) # streamId + + cublasGetPointerMode_v2 = ctype_function(cublasStatus_t, + 
cublasHandle_t, # handle + POINTER( + cublasPointerMode_t)) # mode + + cublasSetPointerMode_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cublasPointerMode_t) # mode + + cublasGetAtomicsMode = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + POINTER(cublasAtomicsMode_t)) # mode + + cublasSetAtomicsMode = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + cublasAtomicsMode_t) # mode + + # Level 1 + cublasSnrm2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # device array + c_int, # incx + c_void_p) # result - host/device scalar + + cublasDnrm2_v2 = cublasSnrm2_v2 + cublasScnrm2_v2 = cublasSnrm2_v2 + cublasDznrm2_v2 = cublasSnrm2_v2 + + cublasSdot_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy, + c_void_p) # result h/d ptr + cublasDdot_v2 = cublasSdot_v2 + cublasCdotu_v2 = cublasSdot_v2 + cublasZdotu_v2 = cublasSdot_v2 + cublasCdotc_v2 = cublasSdot_v2 + cublasZdotc_v2 = cublasSdot_v2 + + cublasSscal_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # alpha h/d + c_void_p, # x + c_int) # incx + + cublasDscal_v2 = cublasSscal_v2 + cublasCscal_v2 = cublasSscal_v2 + cublasZscal_v2 = cublasSscal_v2 + cublasCsscal_v2 = cublasSscal_v2 + cublasZdscal_v2 = cublasSscal_v2 + + cublasSaxpy_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # alpha h/d + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int) # incy + cublasDaxpy_v2 = cublasSaxpy_v2 + cublasCaxpy_v2 = cublasSaxpy_v2 + cublasZaxpy_v2 = cublasSaxpy_v2 + + cublasIsamax_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + POINTER(c_int)) # result h/d ptr + + cublasIdamax_v2 = cublasIsamax_v2 + cublasIcamax_v2 = cublasIsamax_v2 + cublasIzamax_v2 = cublasIsamax_v2 + + cublasIsamin_v2 = cublasIsamax_v2 + + cublasIdamin_v2 = cublasIsamin_v2 + cublasIcamin_v2 = cublasIsamin_v2 + cublasIzamin_v2 = cublasIsamin_v2 + + cublasSasum_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p) # result h/d ptr + + cublasDasum_v2 = cublasSasum_v2 + cublasScasum_v2 = cublasSasum_v2 + cublasDzasum_v2 = cublasSasum_v2 + + cublasSrot_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy + c_void_p, # c + c_void_p) # s h/d ptr + + cublasDrot_v2 = cublasSrot_v2 + cublasCrot_v2 = cublasSrot_v2 + cublasZrot_v2 = cublasSrot_v2 + cublasCsrot_v2 = cublasSrot_v2 + cublasZdrot_v2 = cublasSrot_v2 + + cublasSrotg_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_void_p, # a h/d ptr + c_void_p, # b h/d ptr + c_void_p, # c h/d ptr + c_void_p) # s h/d ptr + + cublasDrotg_v2 = cublasSrotg_v2 + cublasCrotg_v2 = cublasSrotg_v2 + cublasZrotg_v2 = cublasSrotg_v2 + + cublasSrotm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle + c_int, # n + c_void_p, # x + c_int, # incx + c_void_p, # y + c_int, # incy + c_void_p) # param h/d pointer + cublasDrotm_v2 = cublasSrotm_v2 + + cublasSrotmg_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + c_void_p, # d1 h/d ptr + c_void_p, # d2 h/d ptr + c_void_p, # x1 h/d ptr + c_void_p, # y1 h/d ptr + c_void_p) # param h/d ptr + + cublasDrotmg_v2 = cublasSrotmg_v2 + + # + # Level 2 + # + cublasSgbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # 
handle, + cublasOperation_t, # trans, + c_int, # m, + c_int, # n, + c_int, # kl, + c_int, # ku, + c_void_p, # *alpha, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int, # incx, + c_void_p, # *beta, + c_void_p, # *y, + c_int) # incy) + + cublasDgbmv_v2 = cublasSgbmv_v2 + cublasCgbmv_v2 = cublasSgbmv_v2 + cublasZgbmv_v2 = cublasSgbmv_v2 + + cublasSgemv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasOperation_t, # trans, + c_int, # m, + c_int, # n, + c_void_p, # *alpha, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int, # incx, + c_void_p, # *beta, + c_void_p, # *y, + c_int) # incy) + + cublasDgemv_v2 = cublasSgemv_v2 + cublasCgemv_v2 = cublasSgemv_v2 + cublasZgemv_v2 = cublasSgemv_v2 + + cublasStrmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtrmv_v2 = cublasStrmv_v2 + cublasCtrmv_v2 = cublasStrmv_v2 + cublasZtrmv_v2 = cublasStrmv_v2 + + cublasStbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_int, # k, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtbmv_v2 = cublasStbmv_v2 + cublasCtbmv_v2 = cublasStbmv_v2 + cublasZtbmv_v2 = cublasStbmv_v2 + + cublasStpmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *AP, + c_void_p, # *x, + c_int) # incx); + + cublasDtpmv_v2 = cublasStpmv_v2 + cublasCtpmv_v2 = cublasStpmv_v2 + cublasZtpmv_v2 = cublasStpmv_v2 + + cublasStrsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, # handle, + cublasFillMode_t, # uplo, + cublasOperation_t, # trans, + cublasDiagType_t, # diag, + c_int, # n, + c_void_p, # *A, + c_int, # lda, + c_void_p, # *x, + c_int) # incx); + + cublasDtrsv_v2 = cublasStrsv_v2 + cublasCtrsv_v2 = cublasStrsv_v2 + cublasZtrsv_v2 = cublasStrsv_v2 + + cublasStpsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #n, + c_void_p, #*AP, + c_void_p, #*x, + c_int) #incx); + + cublasDtpsv_v2 = cublasStpsv_v2 + cublasCtpsv_v2 = cublasStpsv_v2 + cublasZtpsv_v2 = cublasStpsv_v2 + + cublasStbsv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #n, + c_int, #k, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int) #incx); + + cublasDtbsv_v2 = cublasStbsv_v2 + cublasCtbsv_v2 = cublasStbsv_v2 + cublasZtbsv_v2 = cublasStbsv_v2 + + cublasSsymv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + + cublasDsymv_v2 = cublasSsymv_v2 + cublasCsymv_v2 = cublasSsymv_v2 + cublasZsymv_v2 = cublasSsymv_v2 + + cublasChemv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + cublasZhemv_v2 = cublasChemv_v2 + + cublasSsbmv_v2 = ctype_function(cublasStatus_t, + 
cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_int, #k, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta + c_void_p, #*y, + c_int) #incy); + cublasDsbmv_v2 = cublasSsbmv_v2 + + cublasChbmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + cublasZhbmv_v2 = cublasChbmv_v2 + + cublasSspmv_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*AP, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*beta, + c_void_p, #*y, + c_int) #incy); + + cublasDspmv_v2 = cublasSspmv_v2 + cublasChpmv_v2 = cublasSspmv_v2 + cublasZhpmv_v2 = cublasChpmv_v2 + + cublasSger_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + c_int, #m, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p, #*A, + c_int) #lda); + cublasDger_v2 = cublasSger_v2 + cublasCgeru_v2 = cublasDger_v2 + cublasCgerc_v2 = cublasDger_v2 + cublasZgeru_v2 = cublasDger_v2 + cublasZgerc_v2 = cublasDger_v2 + + cublasSsyr_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*A, + c_int) #lda); + cublasDsyr_v2 = cublasSsyr_v2 + cublasCsyr_v2 = cublasSsyr_v2 + cublasZsyr_v2 = cublasSsyr_v2 + + cublasCher_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*A, + c_int) #lda); + + cublasZher_v2 = cublasCher_v2 + + cublasSspr_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, # uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p) #*AP); + + cublasDspr_v2 = cublasSspr_v2 + cublasChpr_v2 = cublasSspr_v2 + cublasZhpr_v2 = cublasSspr_v2 + + cublasSsyr2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p, #*A, + c_int) #lda); + + cublasDsyr2_v2 = cublasSsyr2_v2 + cublasCsyr2_v2 = cublasSsyr2_v2 + cublasZsyr2_v2 = cublasSsyr2_v2 + cublasCher2_v2 = cublasSsyr2_v2 + cublasZher2_v2 = cublasSsyr2_v2 + + cublasSspr2_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*x, + c_int, #incx, + c_void_p, #*y, + c_int, #incy, + c_void_p) #*AP); + + cublasDspr2_v2 = cublasSspr2_v2 + + cublasChpr2_v2 = cublasSspr2_v2 + cublasZhpr2_v2 = cublasSspr2_v2 + + # Level 3 + cublasSgemm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasOperation_t, #transa, + cublasOperation_t, #transb, + c_int, #m, + c_int, #n, + c_int, #k, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta, + c_void_p, #*C, + c_int) #ldc); + + cublasDgemm_v2 = cublasSgemm_v2 + cublasCgemm_v2 = cublasSgemm_v2 + cublasZgemm_v2 = cublasSgemm_v2 + + cublasSsyrk_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta, + c_void_p, #*C, + c_int) 
#ldc); + + cublasDsyrk_v2 = cublasSsyrk_v2 + cublasCsyrk_v2 = cublasSsyrk_v2 + cublasZsyrk_v2 = cublasSsyrk_v2 + + cublasCherk_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + c_int, #n, + c_int, #k, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + cublasZherk_v2 = cublasCherk_v2 + + cublasSsymm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + + cublasDsymm_v2 = cublasSsymm_v2 + cublasCsymm_v2 = cublasSsymm_v2 + cublasZsymm_v2 = cublasSsymm_v2 + + cublasChemm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*beta + c_void_p, #*C, + c_int) #ldc); + cublasZhemm_v2 = cublasChemm_v2 + + cublasStrsm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int) #ldb); + + cublasDtrsm_v2 = cublasStrsm_v2 + cublasCtrsm_v2 = cublasStrsm_v2 + cublasZtrsm_v2 = cublasStrsm_v2 + + cublasStrmm_v2 = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #side, + cublasFillMode_t, #uplo, + cublasOperation_t, #trans, + cublasDiagType_t, #diag, + c_int, #m, + c_int, #n, + c_void_p, #*alpha + c_void_p, #*A, + c_int, #lda, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*C, + c_int) #ldc); + + cublasDtrmm_v2 = cublasStrmm_v2 + cublasCtrmm_v2 = cublasStrmm_v2 + cublasZtrmm_v2 = cublasStrmm_v2 + + cublasSdgmm = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasSideMode_t, #mode, + c_int, #m, + c_int, #n, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*x, + c_int, #incx, + c_void_p, #*C, + c_int) #ldc); + cublasDdgmm = cublasSdgmm + cublasCdgmm = cublasSdgmm + cublasZdgmm = cublasSdgmm + + cublasSgeam = ctype_function(cublasStatus_t, + cublasHandle_t, #handle, + cublasOperation_t, #transa, + cublasOperation_t, #transb, + c_int, #m, + c_int, #n, + c_void_p, #*alpha, + c_void_p, #*A, + c_int, #lda, + c_void_p, #*beta, + c_void_p, #*B, + c_int, #ldb, + c_void_p, #*C + c_int) #ldc); + cublasDgeam = cublasSgeam + cublasCgeam = cublasSgeam + cublasZgeam = cublasSgeam + + +class _alloc_scalar(object): + def __init__(self, cty, value=None): + if cty == c_double_complex: + cval, data = memalign(cty, align=16) + self._data = data + self._cval = cval + self._as_parameter_ = byref(self._cval) + self.return_value = lambda: self._cval.value + + if value is not None: + val = cty(value) + self._cval.real = val.real + self._cval.imag = val.imag + + else: + if value is not None: + self._data = cty(value) + else: + self._data = cty() + self._as_parameter_ = byref(self._data) + self.return_value = lambda: self._data.value + +def _return_scalar(result): + return result.return_value() + + +def _Tnrm2(fmt, cty): + def nrm2(self, n, x, incx): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%snrm2_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return nrm2 + + +def 
_Tdot(fmt, cty, postfix=''): + def dot(self, n, x, incx, y, incy): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%sdot%s_v2' % (fmt, postfix)) + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), result) + return _return_scalar(result) + + return dot + + +def _Tscal(fmt, cty): + def scal(self, n, alpha, x, incx): + "Stores result to x" + c_alpha = _alloc_scalar(cty, alpha) + fn = getattr(self._api, 'cublas%sscal_v2' % fmt) + fn(self._handle, int(n), c_alpha, device_pointer(x), int(incx)) + + return scal + + +def _Taxpy(fmt, cty): + def axpy(self, n, alpha, x, incx, y, incy): + "Stores result to y" + c_alpha = _alloc_scalar(cty, alpha) + fn = getattr(self._api, 'cublas%saxpy_v2' % fmt) + fn(self._handle, int(n), c_alpha, device_pointer(x), int(incx), + device_pointer(y), int(incy)) + + return axpy + + +def _Itamax(fmt, cty): + def amax(self, n, x, incx): + result = _alloc_scalar(c_int) + fn = getattr(self._api, 'cublasI%samax_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), + result) + return _return_scalar(result) + + return amax + + +def _Itamin(fmt, cty): + def amin(self, n, x, incx): + result = _alloc_scalar(c_int) + fn = getattr(self._api, 'cublasI%samin_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return amin + + +def _Tasum(fmt, cty): + def asum(self, n, x, incx): + result = _alloc_scalar(cty) + fn = getattr(self._api, 'cublas%sasum_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), result) + return _return_scalar(result) + + return asum + + +def _Trot(fmt, cty, sty): + def rot(self, n, x, incx, y, incy, c, s): + "Stores to x and y" + c_c = _alloc_scalar(cty, c) + c_s = _alloc_scalar(sty, s) + fn = getattr(self._api, 'cublas%srot_v2' % fmt) + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), c_c, c_s) + + return rot + + +def _Trotg(fmt, ty, cty): + def rotg(self, a, b): + c_a = _alloc_scalar(ty, a) + c_b = _alloc_scalar(ty, b) + c_c = _alloc_scalar(cty) + c_s = _alloc_scalar(ty) + fn = getattr(self._api, 'cublas%srotg_v2' % fmt) + fn(self._handle, c_a, c_b, c_c, c_s) + r, z, c, s = map(_return_scalar, [c_a, c_b, c_c, c_s]) + return r, z, c, s + + return rotg + + +def _Trotm(fmt, dtype): + def rotm(self, n, x, incx, y, incy, param): + "Stores result to x, y" + fn = getattr(self._api, 'cublas%srotm_v2' % fmt) + assert len(param.shape) == 1, "param must be a 1-d array" + assert param.size >= 5, "param must have at least 5 elements" + assert param.dtype == np.dtype(dtype), "param dtype mismatch" + fn(self._handle, int(n), device_pointer(x), int(incx), + device_pointer(y), int(incy), host_pointer(param)) + + return rotm + + +def _Trotmg(fmt, cty, dtype): + def rotmg(self, d1, d2, x1, y1): + fn = getattr(self._api, 'cublas%srotmg_v2' % fmt) + c_d1 = _alloc_scalar(cty, d1) + c_d2 = _alloc_scalar(cty, d2) + c_x1 = _alloc_scalar(cty, x1) + c_y1 = _alloc_scalar(cty, y1) + param = np.zeros(5, dtype=dtype) + fn(self._handle, c_d1, c_d2, c_x1, c_y1, host_pointer(param)) + return param + + return rotmg + + +def _Tgbmv(fmt, cty, dtype): + def gbmv(self, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy): + '''This function performs the banded matrix-vector multiplication + ''' + fn = getattr(self._api, 'cublas%sgbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + trans = CUBLAS_OP_MAP[trans] + fn(self._handle, trans, m, n, kl, ku, c_alpha, device_pointer(A), + 
lda, device_pointer(x), incx, c_beta, device_pointer(y), incy) + + return gbmv + + +def _Tgemv(fmt, cty, dtype): + def gemv(self, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): + '''This function performs the banded matrix-vector multiplication + ''' + fn = getattr(self._api, 'cublas%sgemv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + trans = CUBLAS_OP_MAP[trans] + fn(self._handle, trans, m, n, c_alpha, device_pointer(A), + lda, device_pointer(x), incx, c_beta, device_pointer(y), incy) + + return gemv + + +def _Ttrmv(fmt, dtype): + def trmv(self, uplo, trans, diag, n, A, lda, x, incx): + fn = getattr(self._api, 'cublas%strmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(A), lda, device_pointer(x), + incx) + + return trmv + + +def _Ttbmv(fmt, dtype): + def tbmv(self, uplo, trans, diag, n, k, A, lda, x, incx): + fn = getattr(self._api, 'cublas%stbmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, k, device_pointer(A), lda, + device_pointer(x), incx) + + return tbmv + + +def _Ttpmv(fmt, dtype): + def tpmv(self, uplo, trans, diag, n, AP, x, incx): + fn = getattr(self._api, 'cublas%stpmv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(AP), device_pointer(x), + incx) + + return tpmv + + +def _Ttrsv(fmt, dtype): + def trsv(self, uplo, trans, diag, n, A, lda, x, incx): + fn = getattr(self._api, 'cublas%strsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(A), lda, device_pointer(x), + incx) + + return trsv + + +def _Ttpsv(fmt, dtype): + def tpsv(self, uplo, trans, diag, n, AP, x, incx): + fn = getattr(self._api, 'cublas%stpsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, device_pointer(AP), device_pointer(x), + incx) + + return tpsv + + +def _Ttbsv(fmt, dtype): + def tbsv(self, uplo, trans, diag, n, k, A, lda, x, incx): + fn = getattr(self._api, 'cublas%stbsv_v2' % fmt) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], + CUBLAS_DIAG_MAP[diag], n, k, device_pointer(A), lda, + device_pointer(x), incx) + + return tbsv + + +def _Tsymv(fmt, cty, dtype): + def symv(self, uplo, n, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%ssymv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return symv + + +def _Themv(fmt, cty, dtype): + def symv(self, uplo, n, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shemv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return symv + + +def _Tsbmv(fmt, cty, dtype): + def sbmv(self, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%ssbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, k, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Thbmv(fmt, cty, dtype): 
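+    # Same wrapper shape as _Tsbmv above; only the cuBLAS entry point
+    # (cublasXhbmv_v2) differs, so the inner function name is reused.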
+ def sbmv(self, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shbmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, k, c_alpha, + device_pointer(A), lda, device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Tspmv(fmt, cty, dtype): + def sbmv(self, uplo, n, alpha, AP, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%sspmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(AP), device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Thpmv(fmt, cty, dtype): + def sbmv(self, uplo, n, alpha, AP, x, incx, beta, y, incy): + fn = getattr(self._api, 'cublas%shpmv_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(AP), device_pointer(x), incx, c_beta, + device_pointer(y), incy) + + return sbmv + + +def _Tger(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sger_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, + device_pointer(A), lda) + + return ger + + +def _Tgeru(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sgeru_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, + device_pointer(A), lda) + + return ger + + +def _Tgerc(fmt, cty, dtype): + def ger(self, m, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sgerc_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, m, n, c_alpha, device_pointer(x), incx, + device_pointer(y), incy, device_pointer(A), lda) + + return ger + + +def _Tsyr(fmt, cty, dtype): + def syr(self, uplo, n, alpha, x, incx, A, lda): + fn = getattr(self._api, 'cublas%ssyr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(A), lda) + + return syr + + +def _Ther(fmt, cty, dtype): + def her(self, uplo, n, alpha, x, incx, A, lda): + fn = getattr(self._api, 'cublas%sher_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(A), lda) + + return her + + +def _Tspr(fmt, cty, dtype): + def spr(self, uplo, n, alpha, x, incx, AP): + fn = getattr(self._api, 'cublas%sspr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(AP)) + + return spr + + +def _Thpr(fmt, cty, dtype): + def hpr(self, uplo, n, alpha, x, incx, AP): + fn = getattr(self._api, 'cublas%shpr_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(AP)) + + return hpr + + +def _Tsyr2(fmt, cty, dtype): + def syr2(self, uplo, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%ssyr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A), + lda) + + return syr2 + + +def _Ther2(fmt, 
cty, dtype): + def her2(self, uplo, n, alpha, x, incx, y, incy, A, lda): + fn = getattr(self._api, 'cublas%sher2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A), + lda) + + return her2 + + +def _Tspr2(fmt, cty, dtype): + def spr2(self, uplo, n, alpha, x, incx, y, incy, A): + fn = getattr(self._api, 'cublas%sspr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A)) + + return spr2 + + +def _Thpr2(fmt, cty, dtype): + def spr2(self, uplo, n, alpha, x, incx, y, incy, A): + fn = getattr(self._api, 'cublas%shpr2_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], n, c_alpha, + device_pointer(x), incx, device_pointer(y), incy, device_pointer(A)) + + return spr2 + + +def _Tgemm(fmt, cty, dtype): + def gemm(self, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc): + fn = getattr(self._api, 'cublas%sgemm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_OP_MAP[transa], CUBLAS_OP_MAP[transb], m, n, k, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return gemm + + +def _Tsyrk(fmt, cty, dtype): + def syrk(self, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): + fn = getattr(self._api, 'cublas%ssyrk_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], n, k, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(C), ldc) + + return syrk + + +def _Therk(fmt, cty, dtype): + def herk(self, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): + fn = getattr(self._api, 'cublas%sherk_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_FILL_MODE_MAP[uplo], CUBLAS_OP_MAP[trans], n, k, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(C), ldc) + + return herk + + +def _Tsymm(fmt, cty, dtype): + def symm(self, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): + fn = getattr(self._api, 'cublas%ssymm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], m, + n, c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return symm + + +def _Themm(fmt, cty, dtype): + def hemm(self, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): + fn = getattr(self._api, 'cublas%shemm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], m, + n, c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + c_beta, device_pointer(C), ldc) + + return hemm + + +def _Ttrsm(fmt, cty, dtype): + def trsm(self, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb): + fn = getattr(self._api, 'cublas%strsm_v2' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], + CUBLAS_OP_MAP[trans], CUBLAS_DIAG_MAP[diag], m, n, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb) + + return trsm + + +def _Ttrmm(fmt, cty, dtype): + def trmm(self, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc): + fn = getattr(self._api, 'cublas%strmm_v2' % fmt) + 
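+        # Note: cublas<t>trmm_v2 is out-of-place and writes its result to C,
+        # unlike legacy BLAS trmm which overwrites B; hence the extra C/ldc
+        # arguments threaded through here.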
c_alpha = _alloc_scalar(cty, alpha) + fn(self._handle, CUBLAS_SIDE_MAP[side], CUBLAS_FILL_MODE_MAP[uplo], + CUBLAS_OP_MAP[trans], CUBLAS_DIAG_MAP[diag], m, n, + c_alpha, device_pointer(A), lda, device_pointer(B), ldb, + device_pointer(C), ldc) + + return trmm + + +def _Tdgmm(fmt, cty, dtype): + def dgmm(self, side, m, n, A, lda, x, incx, C, ldc): + fn = getattr(self._api, 'cublas%sdgmm' % fmt) + fn(self._handle, CUBLAS_SIDE_MAP[side], m, n, device_pointer(A), lda, + device_pointer(x), incx, device_pointer(C), ldc) + + return dgmm + + +def _Tgeam(fmt, cty, dtype): + def geam(self, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): + fn = getattr(self._api, 'cublas%sgeam' % fmt) + c_alpha = _alloc_scalar(cty, alpha) + c_beta = _alloc_scalar(cty, beta) + fn(self._handle, CUBLAS_OP_MAP[transa], CUBLAS_OP_MAP[transb], m, n, + c_alpha, device_pointer(A), lda, c_beta, + device_pointer(B), ldb, device_pointer(C), ldc) + + return geam + + +class cuBlas(OwnerMixin): + def __init__(self): + self._api = libcublas() + self._handle = cublasHandle_t() + self._api.cublasCreate_v2(byref(self._handle)) + self._finalizer_track((self._handle, self._api)) + self._stream = 0 + + @classmethod + def _finalize(self, res): + handle, api = res + api.cublasDestroy_v2(handle) + + @property + def version(self): + ver = c_int() + self._api.cublasGetVersion_v2(self._handle, byref(ver)) + return ver.value + + @property + def stream(self): + return self._stream + + @stream.setter + def stream(self, stream): + self._stream = stream + self._api.cublasSetStream_v2(self._handle, self.stream and self.stream.handle or 0) + + @property + def pointer_mode(self): + mode = cublasPointerMode_t() + self._api.cublasGetPointerMode_v2(self._handle, byref(mode)) + return mode.value + + @pointer_mode.setter + def pointer_mode(self, mode): + self._api.cublasSetPointerMode_v2(self._handle, int(mode)) + + @property + def atomics_mode(self): + mode = cublasAtomicsMode_t() + self._api.cublasGetAtomicsMode(self._handle, byref(mode)) + return mode.value + + @atomics_mode.setter + def atomics_mode(self, mode): + self._api.cublasSetAtomicsMode(self._handle, int(mode)) + + # Level 1 + + Snrm2 = _Tnrm2('S', c_float) + Dnrm2 = _Tnrm2('D', c_double) + Scnrm2 = _Tnrm2('Sc', c_float) + Dznrm2 = _Tnrm2('Dz', c_double) + + Sdot = _Tdot('S', c_float) + Ddot = _Tdot('D', c_double) + Cdotu = _Tdot('C', c_complex, 'u') + Zdotu = _Tdot('Z', c_double_complex, 'u') + Cdotc = _Tdot('C', c_complex, 'c') + Zdotc = _Tdot('Z', c_double_complex, 'c') + + Sscal = _Tscal('S', c_float) + Dscal = _Tscal('D', c_double) + Cscal = _Tscal('C', c_complex) + Zscal = _Tscal('Z', c_double_complex) + Csscal = _Tscal('Cs', c_float) + Zdscal = _Tscal('Zd', c_double) + + Saxpy = _Taxpy('S', c_float) + Daxpy = _Taxpy('D', c_double) + Caxpy = _Taxpy('C', c_complex) + Zaxpy = _Taxpy('Z', c_double_complex) + + Isamax = _Itamax('s', c_float) + Idamax = _Itamax('d', c_double) + Icamax = _Itamax('c', c_complex) + Izamax = _Itamax('z', c_double_complex) + + Isamin = _Itamin('s', c_float) + Idamin = _Itamin('d', c_double) + Icamin = _Itamin('c', c_complex) + Izamin = _Itamin('z', c_double_complex) + + Sasum = _Tasum('S', c_float) + Dasum = _Tasum('D', c_double) + Scasum = _Tasum('Sc', c_float) + Dzasum = _Tasum('Dz', c_double) + + Srot = _Trot('S', c_float, c_float) + Drot = _Trot('D', c_double, c_double) + Crot = _Trot('C', c_float, c_complex) + Zrot = _Trot('Z', c_double, c_double_complex) + Csrot = _Trot('Cs', c_float, c_float) + Zdrot = _Trot('Zd', c_double, c_double) + + Srotg = 
_Trotg('S', c_float, c_float) + Drotg = _Trotg('D', c_double, c_double) + Crotg = _Trotg('C', c_complex, c_float) + Zrotg = _Trotg('Z', c_double_complex, c_double) + + Srotm = _Trotm('S', np.float32) + Drotm = _Trotm('D', np.float64) + + Srotmg = _Trotmg('S', c_float, np.float32) + Drotmg = _Trotmg('D', c_double, np.float64) + + Sgbmv = _Tgbmv('S', c_float, np.float32) + Dgbmv = _Tgbmv('D', c_double, np.float64) + Cgbmv = _Tgbmv('C', c_complex, np.complex64) + Zgbmv = _Tgbmv('Z', c_double_complex, np.complex128) + + Sgemv = _Tgemv('S', c_float, np.float32) + Dgemv = _Tgemv('D', c_double, np.float64) + Cgemv = _Tgemv('C', c_complex, np.complex64) + Zgemv = _Tgemv('Z', c_double_complex, np.complex128) + + Strmv = _Ttrmv('S', np.float32) + Dtrmv = _Ttrmv('D', np.float64) + Ctrmv = _Ttrmv('C', np.complex64) + Ztrmv = _Ttrmv('Z', np.complex128) + + Stbmv = _Ttbmv('S', np.float32) + Dtbmv = _Ttbmv('D', np.float64) + Ctbmv = _Ttbmv('C', np.complex64) + Ztbmv = _Ttbmv('Z', np.complex128) + + Stpmv = _Ttpmv('S', np.float32) + Dtpmv = _Ttpmv('D', np.float64) + Ctpmv = _Ttpmv('C', np.complex64) + Ztpmv = _Ttpmv('Z', np.complex128) + + Strsv = _Ttrsv('S', np.float32) + Dtrsv = _Ttrsv('D', np.float64) + Ctrsv = _Ttrsv('C', np.complex64) + Ztrsv = _Ttrsv('Z', np.complex128) + + Stpsv = _Ttpsv('S', np.float32) + Dtpsv = _Ttpsv('D', np.float64) + Ctpsv = _Ttpsv('C', np.complex64) + Ztpsv = _Ttpsv('Z', np.complex128) + + Stbsv = _Ttbsv('S', np.float32) + Dtbsv = _Ttbsv('D', np.float64) + Ctbsv = _Ttbsv('C', np.complex64) + Ztbsv = _Ttbsv('Z', np.complex128) + + Ssymv = _Tsymv('S', c_float, np.float32) + Dsymv = _Tsymv('D', c_double, np.float64) + Csymv = _Tsymv('C', c_complex, np.complex64) + Zsymv = _Tsymv('Z', c_double_complex, np.complex128) + + Chemv = _Themv('C', c_complex, np.complex64) + Zhemv = _Themv('Z', c_double_complex, np.complex128) + + Ssbmv = _Tsbmv('S', c_float, np.float32) + Dsbmv = _Tsbmv('D', c_double, np.float64) + + Chbmv = _Thbmv('C', c_complex, np.complex64) + Zhbmv = _Thbmv('Z', c_double_complex, np.complex128) + + Sspmv = _Tspmv('S', c_float, np.float32) + Dspmv = _Tspmv('D', c_double, np.float64) + + Chpmv = _Thpmv('C', c_complex, np.complex64) + Zhpmv = _Thpmv('Z', c_double_complex, np.complex128) + + Sger = _Tger('S', c_float, np.float32) + Dger = _Tger('D', c_double, np.float64) + Cgeru = _Tgeru('C', c_complex, np.complex64) + Cgerc = _Tgerc('C', c_complex, np.complex64) + Zgeru = _Tgeru('Z', c_double_complex, np.complex128) + Zgerc = _Tgerc('Z', c_double_complex, np.complex128) + + Ssyr = _Tsyr('S', c_float, np.float32) + Dsyr = _Tsyr('D', c_double, np.float64) + Csyr = _Tsyr('C', c_complex, np.complex64) + Zsyr = _Tsyr('Z', c_double_complex, np.complex128) + + Cher = _Ther('C', c_float, np.complex64) + Zher = _Ther('Z', c_double, np.complex128) + + Sspr = _Tspr('S', c_float, np.float32) + Dspr = _Tspr('D', c_double, np.float64) + Chpr = _Thpr('C', c_float, np.complex64) + Zhpr = _Thpr('Z', c_double, np.complex128) + + Ssyr2 = _Tsyr2('S', c_float, np.float32) + Dsyr2 = _Tsyr2('D', c_double, np.float64) + Csyr2 = _Tsyr2('C', c_complex, np.complex64) + Zsyr2 = _Tsyr2('Z', c_double_complex, np.complex128) + Cher2 = _Ther2('C', c_complex, np.complex64) + Zher2 = _Ther2('Z', c_double_complex, np.complex128) + + Sspr2 = _Tspr2('S', c_float, np.float32) + Dspr2 = _Tspr2('D', c_double, np.float64) + + Chpr2 = _Thpr2('C', c_complex, np.complex64) + Zhpr2 = _Thpr2('Z', c_double_complex, np.complex128) + + Sgemm = _Tgemm('S', c_float, np.float32) + Dgemm = _Tgemm('D', c_double, 
np.float64) + Cgemm = _Tgemm('C', c_complex, np.complex64) + Zgemm = _Tgemm('Z', c_double_complex, np.complex128) + + Ssyrk = _Tsyrk('S', c_float, np.float32) + Dsyrk = _Tsyrk('D', c_double, np.float64) + Csyrk = _Tsyrk('C', c_complex, np.complex64) + Zsyrk = _Tsyrk('Z', c_double_complex, np.complex128) + + Cherk = _Therk('C', c_float, np.complex64) + Zherk = _Therk('Z', c_double, np.complex128) + + Ssymm = _Tsymm('S', c_float, np.float32) + Dsymm = _Tsymm('D', c_double, np.float64) + Csymm = _Tsymm('C', c_complex, np.complex64) + Zsymm = _Tsymm('Z', c_double_complex, np.complex128) + + Chemm = _Themm('C', c_complex, np.complex64) + Zhemm = _Themm('Z', c_double_complex, np.complex128) + + Strsm = _Ttrsm('S', c_float, np.float32) + Dtrsm = _Ttrsm('D', c_double, np.float64) + Ctrsm = _Ttrsm('C', c_complex, np.complex64) + Ztrsm = _Ttrsm('Z', c_double_complex, np.complex128) + + Strmm = _Ttrmm('S', c_float, np.float32) + Dtrmm = _Ttrmm('D', c_double, np.float64) + Ctrmm = _Ttrmm('C', c_complex, np.complex64) + Ztrmm = _Ttrmm('Z', c_double_complex, np.complex128) + + Sdgmm = _Tdgmm('S', c_float, np.float32) + Ddgmm = _Tdgmm('D', c_double, np.float64) + Cdgmm = _Tdgmm('C', c_complex, np.complex64) + Zdgmm = _Tdgmm('Z', c_double_complex, np.complex128) + + Sgeam = _Tgeam('S', c_float, np.float32) + Dgeam = _Tgeam('D', c_double, np.float64) + Cgeam = _Tgeam('C', c_complex, np.complex64) + Zgeam = _Tgeam('Z', c_double_complex, np.complex128) diff --git a/pyculib/config.py b/pyculib/config.py new file mode 100644 index 0000000..58813fd --- /dev/null +++ b/pyculib/config.py @@ -0,0 +1,26 @@ +import os + +class Environment: + + def __init__(self): + """Set config flags based on environment variables.""" + + self._environ = os.environ + WARNINGS = Environment._readenv("PYCULIB_WARNINGS", int, 0) + + globals()['WARNINGS'] = WARNINGS + + @staticmethod + def _readenv(name, ctor, default): + value = os.environ.get(name) + if value is None: + return default() if callable(default) else default + try: + return ctor(value) + except Exception: + warnings.warn("environ %s defined but failed to parse '%s'" % + (name, res), RuntimeWarning) + return default + +_env = Environment() + diff --git a/pyculib/fft/__init__.py b/pyculib/fft/__init__.py new file mode 100644 index 0000000..edfa8a1 --- /dev/null +++ b/pyculib/fft/__init__.py @@ -0,0 +1 @@ +from .api import FFTPlan, fft, ifft, fft_inplace, ifft_inplace diff --git a/pyculib/fft/api.py b/pyculib/fft/api.py new file mode 100644 index 0000000..2d96788 --- /dev/null +++ b/pyculib/fft/api.py @@ -0,0 +1,233 @@ +import numpy as np +from . import binding as _cufft +from numba import cuda +from numba.cuda.kernels.transpose import transpose +from numba.cuda.cudadrv.devicearray import DeviceNDArray + +def _prepare_types(pairs): + return dict((tuple(map(np.dtype, k)), + getattr(_cufft, 'CUFFT_' + v)) + for k, v in pairs.items()) + +class FFTPlan(object): + ''' + :param shape: Input array shape. + :param itype: Input data type. + :param otype: Output data type. + :param batch: Maximum number of operation to perform. + :param stream: A CUDA stream for all the operations to put on. + :param mode: Operation mode; e.g. MODE_NATIVE, MODE_FFTW_PADDING, + MODE_FFTW_ASYMMETRIC, MODE_FFTW_ALL, MODE_DEFAULT. 
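+
+    A minimal usage sketch (shapes and dtypes are illustrative only; a
+    CUDA-capable device is assumed). For a real-to-complex plan the output
+    has shape[-1] // 2 + 1 points along the last axis:
+
+    >>> import numpy as np
+    >>> from pyculib.fft import FFTPlan
+    >>> x = np.arange(8, dtype=np.float32)
+    >>> y = np.empty(5, dtype=np.complex64)   # 8 // 2 + 1 output points
+    >>> plan = FFTPlan(x.shape, np.float32, np.complex64)
+    >>> plan.forward(x, out=y)                # fills and returns `y`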
+ ''' + + MODE_NATIVE = _cufft.CUFFT_COMPATIBILITY_NATIVE + MODE_FFTW_PADDING = _cufft.CUFFT_COMPATIBILITY_FFTW_PADDING + MODE_FFTW_ASYMMETRIC = _cufft.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC + MODE_FFTW_ALL = _cufft.CUFFT_COMPATIBILITY_FFTW_ALL + MODE_DEFAULT = _cufft.CUFFT_COMPATIBILITY_DEFAULT + + SUPPORTED_TYPES = _prepare_types({ + (np.float32, np.complex64) : 'R2C', + (np.float64, np.complex128) : 'D2Z', + (np.complex64, np.float32) : 'C2R', + (np.complex128, np.float64) : 'Z2D', + (np.complex64, np.complex64) : 'C2C', + (np.complex128, np.complex128) : 'Z2Z', + }) + + @cuda.require_context + def __init__(self, shape, itype, otype, batch=1, stream=0, + mode=MODE_DEFAULT): + + itype = np.dtype(itype) + otype = np.dtype(otype) + + try: + operation = self.SUPPORTED_TYPES[(itype, otype)] + except KeyError: + raise ValueError("Invalid type combination") + + if operation in (_cufft.CUFFT_R2C, _cufft.CUFFT_D2Z): + direction = 'forward' + elif operation in (_cufft.CUFFT_C2R, _cufft.CUFFT_Z2D): + direction = 'inverse' + shape = shape[:-1] + ((shape[-1] - 1) * 2,) + else: + direction = 'both' + + self._plan = _cufft.Plan.many(shape, operation, batch=batch) + if stream: + self._plan.set_stream(stream) + self._plan.set_compatibility_mode(mode) + + complex_types = [np.dtype(x) for x in (np.complex64, np.complex128)] + + if itype in complex_types and otype in complex_types: + ishape = oshape = shape + elif itype in complex_types: + ishape = oshape = shape[:-1] + (shape[-1]//2 + 1,) + elif otype in complex_types: + ishape = shape + oshape = shape[:-1] + (shape[-1]//2 + 1,) + else: + raise ValueError("Invalid type combination") + + self.operation = operation + self.itype = itype + self.otype = otype + self.shape = shape + self.ishape = ishape + self.oshape = oshape + self.batch = batch + self.stream = stream + self.mode = mode + self.direction = direction + + def _prepare(self, ary, out): + if ary.shape < self.ishape: + raise ValueError("Incompatible input array shape") + + if ary.dtype != self.itype: + raise ValueError("Incompatiable input array dtype") + + do_host_copy = False + if out is not None: + h_out = out + d_out, do_host_copy = cuda._auto_device(out, copy=False, + stream=self.stream) + else: + h_out = np.empty(shape=self.oshape, dtype=self.otype) + d_out = from_array_like(h_out, self.stream) + do_host_copy = True + + if h_out.shape < self.oshape: + raise ValueError("Incompatible output shape") + + d_ary, _ = cuda._auto_device(ary, stream=self.stream) + return d_ary, d_out, h_out, do_host_copy + + def _maybe_transpose(self, d_ary, d_out): + """Transpose device arrays into row-major format if needed, as cuFFT + can't handle column-major data.""" + + transpose_in = len(d_ary.shape) == 2 and d_ary.is_f_contiguous() + transpose_out = len(d_out.shape) == 2 and d_out.is_f_contiguous() + if transpose_in: + # Create a row-major device array + used_in = DeviceNDArray(shape=(d_ary.shape[1], d_ary.shape[0]), + strides=(d_ary.dtype.itemsize, + d_ary.dtype.itemsize*d_ary.shape[1]), + dtype=d_ary.dtype) + transpose(d_ary, used_in) + else: + used_in = d_ary + if transpose_out: + # Create a row-major device array + used_out = DeviceNDArray(shape=d_out.shape, + strides=(d_out.dtype.itemsize*d_out.shape[1], + d_out.dtype.itemsize), + dtype=d_out.dtype) + else: + used_out = d_out + return used_in, used_out, transpose_out + + def forward(self, ary, out=None): + '''Perform forward FFT + + :param ary: Input array + :param out: Optional output array + + :returns: The output array or a new numpy array is `out` is 
None. + + .. note:: If `ary` is `out`, an inplace operation is performed. + ''' + if self.direction not in ('both', 'forward'): + raise TypeError("Invalid operation") + d_ary, d_out, h_out, do_host_copy = self._prepare(ary, out) + used_in, used_out, transpose_out = self._maybe_transpose(d_ary, d_out) + self._plan.forward(used_in, used_out) + if do_host_copy: + if transpose_out: + # reshape the array so it can be transposed back + used_out.shape=(used_out.shape[1], used_out.shape[0]) + used_out.strides=(used_out.dtype.itemsize, + used_out.dtype.itemsize*used_out.shape[0]) + # and finally transpose it + transpose(used_out, d_out) + d_out.copy_to_host(h_out) + return h_out + + def inverse(self, ary, out=None): + '''Perform inverse FFT + + :param ary: Input array + :param out: Optional output array + + :returns: The output array or a new numpy array is `out` is None. + + .. note: If `ary` is `out`, an inplace operation is performed. + ''' + if self.direction not in ('both', 'inverse'): + raise TypeError("Invalid operation") + d_ary, d_out, h_out, do_host_copy = self._prepare(ary, out) + used_in, used_out, transpose_out = self._maybe_transpose(d_ary, d_out) + self._plan.inverse(used_in, used_out) + if do_host_copy: + if transpose_out: + # reshape the array so it can be transposed back + used_out.shape=(used_out.shape[1], used_out.shape[0]) + used_out.strides=(used_out.dtype.itemsize, + used_out.dtype.itemsize*used_out.shape[0]) + # and finally transpose it + transpose(used_out, d_out) + d_out.copy_to_host(h_out) + return h_out + +# +# Simple one-off functions +# + +def fft(ary, out, stream=None): + '''Perform forward FFT on `ary` and output to `out`. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + plan = FFTPlan(ary.shape, ary.dtype, out.dtype, stream=stream) + plan.forward(ary, out) + return out + +def ifft(ary, out, stream=None): + '''Perform inverse FFT on `ary` and output to `out`. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + plan = FFTPlan(ary.shape, ary.dtype, out.dtype, stream=stream) + plan.inverse(ary, out) + return out + +def fft_inplace(ary, stream=None): + '''Perform inplace forward FFT. `ary` must have complex dtype. + + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + d_ary, conv = cuda._auto_device(ary, stream=stream) + fft(d_ary, d_ary, stream=stream) + if conv: + d_ary.copy_to_host(ary) + return ary + + +def ifft_inplace(ary, stream=None): + '''Perform inplace inverse FFT. `ary` must have complex dtype. 
+ + out --- can be a numpy array or a GPU device array with 1 <= ndim <= 3 + stream --- a CUDA stream + ''' + d_ary, conv = cuda._auto_device(ary, stream=stream) + ifft(d_ary, d_ary, stream=stream) + if conv: + d_ary.copy_to_host(ary) + return ary diff --git a/pyculib/fft/binding.py b/pyculib/fft/binding.py new file mode 100644 index 0000000..55b273d --- /dev/null +++ b/pyculib/fft/binding.py @@ -0,0 +1,261 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import c_void_p, c_int, POINTER, byref + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import Lib, ctype_function, finalizer + +STATUS = { + 0x0: 'CUFFT_SUCCESS', + 0x1: 'CUFFT_INVALID_PLAN', + 0x2: 'CUFFT_ALLOC_FAILED', + 0x3: 'CUFFT_INVALID_TYPE', + 0x4: 'CUFFT_INVALID_VALUE', + 0x5: 'CUFFT_INTERNAL_ERROR', + 0x6: 'CUFFT_EXEC_FAILED', + 0x7: 'CUFFT_SETUP_FAILED', + 0x8: 'CUFFT_INVALID_SIZE', + 0x9: 'CUFFT_UNALIGNED_DATA', +} + +cufftResult = c_int + +CUFFT_FORWARD = -1 +CUFFT_INVERSE = 1 + +CUFFT_R2C = 0x2a # Real to Complex (interleaved) +CUFFT_C2R = 0x2c # Complex (interleaved) to Real +CUFFT_C2C = 0x29 # Complex to Complex, interleaved +CUFFT_D2Z = 0x6a # Double to Double-Complex +CUFFT_Z2D = 0x6c # Double-Complex to Double +CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex + +cufftType = c_int + +CUFFT_COMPATIBILITY_NATIVE = 0x00 +CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 # The default value +CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02 +CUFFT_COMPATIBILITY_FFTW_ALL = 0x03 + +CUFFT_COMPATIBILITY_DEFAULT = CUFFT_COMPATIBILITY_FFTW_PADDING + +cufftCompatibility = c_int + +cufftHandle = c_int + + +class CuFFTError(Exception): + def __init__(self, code): + super(CuFFTError, self).__init__(STATUS[code]) + + +class libcufft(Lib): + lib = 'cufft' + ErrorType = CuFFTError + + @property + def version(self): + ver = c_int(0) + self.cufftGetVersion(byref(ver)) + return ver.value + + cufftGetVersion = ctype_function(cufftResult, POINTER(c_int)) + + cufftPlan1d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + cufftType, # type + c_int, + # batch - deprecated - use cufftPlanMany + ) + + cufftPlan2d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + c_int, # ny + cufftType # type + ) + + cufftPlan3d = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # nx + c_int, # ny + c_int, # nz + cufftType # type + ) + + cufftPlanMany = ctype_function(cufftResult, + POINTER(cufftHandle), # plan + c_int, # rank + c_void_p, # POINTER(c_int) n + c_void_p, # POINTER(c_int) inembed + c_int, # istride + c_int, # idist + c_void_p, # POINTER(c_int) onembed + c_int, # ostride + c_int, # odist + cufftType, # type + c_int, # batch + ) + + cufftDestroy = ctype_function(cufftResult, + cufftHandle, # plan + ) + + cufftExecC2C = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftComplex) idata + c_void_p, # POINTER(cufftComplex) odata + c_int, # direction + ) + + cufftExecR2C = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftReal) idata + c_void_p, # POINTER(cufftComplex) odata + c_int, + ) + + cufftExecC2R = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftComplex) idata + c_void_p, # POINTER(cufftReal) odata + c_int, + ) + + cufftExecZ2Z = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleComplex) idata + c_void_p, # POINTER(cufftDoubleComplex) odata + c_int, # direction + ) + + 
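+    # The remaining cufftExec* bindings below are declared here with the same
+    # (plan, idata, odata, direction) argument layout; only the element types
+    # of idata/odata differ.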
cufftExecD2Z = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleReal) idata + c_void_p, # POINTER(cufftDoubleComplex) odata + c_int, + ) + + cufftExecZ2D = ctype_function(cufftResult, + cufftHandle, # plan + c_void_p, # POINTER(cufftDoubleComplex) idata + c_void_p, # POINTER(cufftDoubleReal) odata + c_int, + ) + + cufftSetStream = ctype_function(cufftResult, + cufftHandle, # plan, + cu_stream, # stream + ) + + cufftSetCompatibilityMode = ctype_function(cufftResult, + cufftHandle, # plan, + cufftCompatibility # mode + ) + + +cufft_dtype_to_name = { + CUFFT_R2C: 'R2C', + CUFFT_C2R: 'C2R', + CUFFT_C2C: 'C2C', + CUFFT_D2Z: 'D2Z', + CUFFT_Z2D: 'Z2D', + CUFFT_Z2Z: 'Z2Z', +} + + +class Plan(finalizer.OwnerMixin): + @classmethod + def one(cls, dtype, nx): + "cufftPlan1d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + BATCH = 1 # deprecated args to cufftPlan1d + inst._api.cufftPlan1d(byref(inst._handle), int(nx), int(dtype), + BATCH) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def two(cls, dtype, nx, ny): + "cufftPlan2d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + inst._api.cufftPlan2d(byref(inst._handle), int(nx), int(ny), + int(dtype)) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def three(cls, dtype, nx, ny, nz): + "cufftPlan3d" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + inst._api.cufftPlan3d(byref(inst._handle), int(nx), int(ny), + int(nz), int(dtype)) + inst.dtype = dtype + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def many(cls, shape, dtype, batch=1): + "cufftPlanMany" + inst = object.__new__(cls) + inst._api = libcufft() + inst._handle = cufftHandle() + + c_shape = np.asarray(shape, dtype=np.int32) + inst._api.cufftPlanMany(byref(inst._handle), + len(shape), + c_shape.ctypes.data, + None, 1, 0, + None, 1, 0, + int(dtype), int(batch)) + inst.shape = shape + inst.dtype = dtype + inst.batch = batch + inst._finalizer_track((inst._handle, inst._api)) + return inst + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cufftDestroy(handle) + + def set_stream(self, stream): + "Associate a CUDA stream to this plan object" + return self._api.cufftSetStream(self._handle, stream.handle) + + def set_compatibility_mode(self, mode): + return self._api.cufftSetCompatibilityMode(self._handle, mode) + + def set_native_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_NATIVE) + + def set_fftw_padding_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_PADDING) + + def set_fftw_asymmetric_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC) + + def set_fftw_all_mode(self): + return self.set_compatibility_mode(CUFFT_COMPATIBILITY_FFTW_ALL) + + def exe(self, idata, odata, dir): + postfix = cufft_dtype_to_name[self.dtype] + meth = getattr(self._api, 'cufftExec' + postfix) + return meth(self._handle, device_pointer(idata), + device_pointer(odata), int(dir)) + + def forward(self, idata, odata): + return self.exe(idata, odata, dir=CUFFT_FORWARD) + + def inverse(self, idata, odata): + return self.exe(idata, odata, dir=CUFFT_INVERSE) diff --git a/pyculib/nputil.py b/pyculib/nputil.py new file mode 100644 index 0000000..982e967 --- /dev/null +++ b/pyculib/nputil.py @@ -0,0 +1,45 @@ +import numpy as np +from pyculib 
import warnings + +promote = np.promote_types # type promotion + +def alias(a, b): + """Check whether the arrays `a` and `b` alias.""" + + if a is b: + return True + elif a.base is None and b.base is None: + return False + else: + return a.base is b or a is b.base or a.base is b.base + +def astype(x, dtype, var, stacklevel=3): + """Return `x` or a copy of `x`, with its type converted to `dtype`. + `var` is the name of `x` as seen by users of a public API, which may be + used in a warning message. `stacklevel` corresponds to the number of frames + to skip when reporting the warning.""" + + # stacklevel=3 means the warning will be reported against the BLAS call, + # not against this (astype()) function. + # Make this a variable as sometimes the call is nested, so the number of + # frames needs to be adjusted. + if dtype != x.dtype: + warnings.warn("%s (%s) is converted to %s"%(var, x.dtype, dtype), + warnings.PerformanceWarning, stacklevel=stacklevel) + return x.astype(dtype, copy=False) + + +def colmajor(x, dtype, var): + """Return `x` or a copy of `x`, with its dimension ordering converted to + column-major, and its type converted to `dtype`. + `var` is the name of `x` as seen by users of a public API, which may be + used in a warning message.""" + + if not x.flags['F_CONTIGUOUS']: + warnings.warn("%s is converted to column-major layout"%(var), + warnings.PerformanceWarning, stacklevel=3) + return np.asfortranarray(x, dtype=dtype) + else: + return astype(x, dtype, var, stacklevel=4) + + diff --git a/pyculib/rand/__init__.py b/pyculib/rand/__init__.py new file mode 100644 index 0000000..1e9c70b --- /dev/null +++ b/pyculib/rand/__init__.py @@ -0,0 +1,2 @@ +from .api import PRNG, QRNG +from .api import uniform, normal, lognormal, poisson, quasi diff --git a/pyculib/rand/api.py b/pyculib/rand/api.py new file mode 100644 index 0000000..573b53b --- /dev/null +++ b/pyculib/rand/api.py @@ -0,0 +1,371 @@ +import numpy as np +import time +from . import binding +from numba import cuda + +class RNG(object): + "cuRAND pseudo random number generator" + def __init__(self, gen): + self._gen = gen + self.__stream = 0 + + @property + def offset(self): + return self.__offset + + @offset.setter + def offset(self, offset): + self.__offset = offset + self._gen.set_offset(offset) + + @property + def stream(self): + '''Associate a CUDA stream to the generator object. + All subsequent calls will use this stream.''' + return self.__stream + + @stream.setter + def stream(self, stream): + self.__stream = stream + self._gen.set_stream(stream) + + def _require_array(self, ary): + if ary.ndim != 1: + raise TypeError("Only accept 1-D array") + if ary.strides[0] != ary.dtype.itemsize: + raise TypeError("Only accept unit strided array") + + +class PRNG(RNG): + '''cuRAND pseudo random number generator + + :param rndtype: Algorithm type. All possible values are listed as + class attributes of this class, e.g. TEST, DEFAULT, + XORWOW, MRG32K3A, MTGP32. + :param seed: Seed for the RNG. + :param offset: Offset to the random number stream. + :param stream: CUDA stream. + + Example: + + >>> from pyculib import rand + >>> from numpy import empty + >>> prng = rand.PRNG(rndtype=rand.PRNG.XORWOW) + >>> r = empty(10) + >>> prng.uniform(r) + >>> r + array([ ... 
]) + ''' + + TEST = binding.CURAND_RNG_TEST + DEFAULT = binding.CURAND_RNG_PSEUDO_DEFAULT + XORWOW = binding.CURAND_RNG_PSEUDO_XORWOW + MRG32K3A = binding.CURAND_RNG_PSEUDO_MRG32K3A + MTGP32 = binding.CURAND_RNG_PSEUDO_MTGP32 + + @cuda.require_context + def __init__(self, rndtype=DEFAULT, seed=None, offset=None, stream=None): + super(PRNG, self).__init__(binding.Generator(rndtype)) + self.rndtype = rndtype + if seed is not None: + self.seed = seed + if offset is not None: + self.offset = offset + if stream is not None: + self.stream = stream + + @property + def seed(self): + "Mutatable attribute for the seed for the RNG" + return self.__seed + + @seed.setter + def seed(self, seed): + self.__seed = seed + self._gen.set_pseudo_random_generator_seed(seed) + + def uniform(self, ary, size=None): + '''Generate floating point random number sampled + from a uniform distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_uniform(dary, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + def normal(self, ary, mean, sigma, size=None): + '''Generate floating point random number sampled + from a normal distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param mean: Center of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_normal(dary, size, mean, sigma) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + + def lognormal(self, ary, mean, sigma, size=None): + '''Generate floating point random number sampled + from a log-normal distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param mean: Center of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_log_normal(dary, size, mean, sigma) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + def poisson(self, ary, lmbd, size=None): + '''Generate floating point random number sampled + from a poisson distribution and fill into ary. + + :param ary: Numpy array or cuda device array. + :param lmbda: Lambda for the distribution. + :param size: Number of samples. Default to array size. + ''' + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate_poisson(dary, lmbd, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + +class QRNG(RNG): + '''cuRAND quasi random number generator + + :param rndtype: Algorithm type. + Also control output data type. + All possible values are listed as class + attributes of this class, e.g. TEST, DEFAULT, SOBOL32, + SCRAMBLED_SOBOL32, SOBOL64, SCRAMABLED_SOBOL64. + :param ndim: Number of dimension for the QRNG. + :param offset: Offset to the random number stream. + :param stream: CUDA stream. 
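+
+    Example (a minimal sketch mirroring the PRNG example above; note that
+    SOBOL32 output must be written into a 32-bit integer array):
+
+    >>> from pyculib import rand
+    >>> from numpy import empty, uint32
+    >>> qrng = rand.QRNG(rndtype=rand.QRNG.SOBOL32, ndim=1)
+    >>> r = empty(10, dtype=uint32)
+    >>> qrng.generate(r)
+    >>> r
+    array([ ... ])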
+ ''' + + + TEST = binding.CURAND_RNG_TEST + DEFAULT = binding.CURAND_RNG_QUASI_DEFAULT + SOBOL32 = binding.CURAND_RNG_QUASI_SOBOL32 + SCRAMBLED_SOBOL32 = binding.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 + SOBOL64 = binding.CURAND_RNG_QUASI_SOBOL64 + SCRAMBLED_SOBOL64 = binding.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 + + @cuda.require_context + def __init__(self, rndtype=DEFAULT, ndim=None, offset=None, stream=None): + super(QRNG, self).__init__(binding.Generator(rndtype)) + self.rndtype = rndtype + if ndim is not None: + self.ndim = ndim + if offset is not None: + self.offset = offset + if stream is not None: + self.stream = stream + + @property + def ndim(self, ndim): + '''Mutatable attribute for number of dimension for the QRNG. + ''' + return self.__ndim + + @ndim.setter + def ndim(self, ndim): + self.__ndim = ndim + self._gen.set_quasi_random_generator_dimensions(ndim) + + def generate(self, ary, size=None): + """Generate quasi random number in ary. + + :param ary: Numpy array or cuda device array. + + :param size: Number of samples; + Default to array size. Must be multiple of ndim. + """ + self._require_array(ary) + size = size or ary.size + dary, conv = cuda._auto_device(ary, stream=self.stream) + self._gen.generate(dary, size) + if conv: + dary.copy_to_host(ary, stream=self.stream) + + +# +# Top level function entry points. +# + +_global_rng = {} + +def _get_prng(): + key = 'prng' + prng = _global_rng.get(key) + if not prng: + prng = PRNG() + prng.seed = int(time.time()) + _global_rng[key] = prng + return prng + + +def _get_qrng(bits): + assert bits in (32, 64), "not 32 or 64 bit" + key = 'qrng%d' % bits + qrng = _global_rng.get(key) + if not qrng: + qrng = QRNG(rndtype=getattr(QRNG, 'SOBOL%d' % bits)) + _global_rng[key] = qrng + return qrng + +def uniform(size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a uniform distribution + + :param size: Number of samples. + :param dtype: np.float32 or np.float64. + :param device: Set to True to return a device array instead or numpy array. + + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.uniform(size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.uniform` + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.uniform(devary, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def normal(mean, sigma, size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a normal distribution + + :param mean: Center point of the distribution. + :param sigma: Standard deviation of the distribution. + :param size: --- Number of samples. + :param dtype: np.float32 or np.float64. + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.normal(mean=0, sigma=1, size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.normal` + + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.normal(devary, mean, sigma, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def lognormal(mean, sigma, size, dtype=np.float, device=False): + '''Generate floating point random number sampled + from a log-normal distribution. + + :param mean: Center point of the distribution. + :param sigma: Standard deviation of the distribution. 
+ :param size: Number of samples. + :param dtype: np.float32 or np.float64. + :param device: set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.lognormal(mean=0, sigma=1, size=10) + array([...]) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.lognormal` + + ''' + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.lognormal(devary, mean, sigma, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def poisson(lmbd, size, device=False): + '''Generate int32 random number sampled + from a poisson distribution. + + :param lmbda: Lambda of the distribution. + :param size: Number of samples + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.poisson(lmbd=1, size=10) + array([...], dtype=uint32) + + .. seealso:: :py:meth:`pyculib.rand.PRNG.poisson` + ''' + ary = np.empty(size, dtype=np.uint32) + devary = cuda.to_device(ary, copy=False) + prng = _get_prng() + prng.poisson(devary, lmbd, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary + +def quasi(size, bits=32, nd=1, device=False): + '''Generate quasi random number using SOBOL{bits} RNG type. + + :param size: Number of samples. + :param bits: Bit length of output element; e.g. 32 or 64. + :param nd: Number of dimension . + :param device: Set to True to return a device array instead or ndarray. + :returns: A numpy array or a device array. + + >>> from pyculib import rand + >>> rand.quasi(10) + array([...], dtype=uint32) + + .. seealso:: :py:meth:`pyculib.rand.QRNG.generate` + ''' + if bits == 64: + dtype = np.uint64 + elif bits == 32: + dtype = np.uint32 + else: + raise ValueError("Only accept bits = 32 or 64") + ary = np.empty(size, dtype=dtype) + devary = cuda.to_device(ary, copy=False) + qrng = _get_qrng(bits) + qrng.ndim = nd + qrng.generate(devary, size) + if device: + return devary + else: + devary.copy_to_host(ary) + return ary diff --git a/pyculib/rand/binding.py b/pyculib/rand/binding.py new file mode 100644 index 0000000..8914017 --- /dev/null +++ b/pyculib/rand/binding.py @@ -0,0 +1,298 @@ +from __future__ import absolute_import +import numpy as np +from ctypes import (c_float, c_int, c_void_p, POINTER, byref, cast, c_ulonglong, + c_uint, c_double, c_size_t) + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import Lib, ctype_function, finalizer + +# enum curandStatus +STATUS = { + 0: ('CURAND_STATUS_SUCCESS', + 'No errors'), + 100: ('CURAND_STATUS_VERSION_MISMATCH', + 'Header file and linked library version do not match'), + 101: ('CURAND_STATUS_NOT_INITIALIZED', + 'Generator not initialized'), + 102: ('CURAND_STATUS_ALLOCATION_FAILED', + 'Memory allocation failed'), + 103: ('CURAND_STATUS_TYPE_ERROR', + 'Generator is wrong type'), + 104: ('CURAND_STATUS_OUT_OF_RANGE', + 'Argument out of range'), + 105: ('CURAND_STATUS_LENGTH_NOT_MULTIPLE', + 'Length requested is not a multple of dimension'), + 106: ('CURAND_STATUS_DOUBLE_PRECISION_REQUIRED', + 'GPU does not have double precision required by MRG32k3a'), + 201: ('CURAND_STATUS_LAUNCH_FAILURE', + 'Kernel launch failure'), + 202: ('CURAND_STATUS_PREEXISTING_FAILURE', + 'Preexisting failure on library entry'), + 203: ('CURAND_STATUS_INITIALIZATION_FAILED', + 'Initialization of CUDA failed'), + 204: 
('CURAND_STATUS_ARCH_MISMATCH', + 'Architecture mismatch, GPU does not support requested feature'), + 999: ('CURAND_STATUS_INTERNAL_ERROR', + 'Internal library error'), +} +curandStatus_t = c_int + + +# enum curandRngType +CURAND_RNG_TEST = 0 +## Default pseudorandom generator +CURAND_RNG_PSEUDO_DEFAULT = 100 +## XORWOW pseudorandom generator +CURAND_RNG_PSEUDO_XORWOW = 101 +## MRG32k3a pseudorandom generator +CURAND_RNG_PSEUDO_MRG32K3A = 121 +## Mersenne Twister pseudorandom generator +CURAND_RNG_PSEUDO_MTGP32 = 141 +## Default quasirandom generator +CURAND_RNG_QUASI_DEFAULT = 200 +## Sobol32 quasirandom generator +CURAND_RNG_QUASI_SOBOL32 = 201 +## Scrambled Sobol32 quasirandom generator +CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 +## Sobol64 quasirandom generator +CURAND_RNG_QUASI_SOBOL64 = 203 +## Scrambled Sobol64 quasirandom generator +CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 +curandRngType_t = c_int + +# enum curandOrdering +## Best ordering for pseudorandom results +CURAND_ORDERING_PSEUDO_BEST = 100 +## Specific default 4096 thread sequence for pseudorandom results +CURAND_ORDERING_PSEUDO_DEFAULT = 101 +## Specific seeding pattern for fast lower quality pseudorandom results +CURAND_ORDERING_PSEUDO_SEEDED = 102 +## Specific n-dimensional ordering for quasirandom results +CURAND_ORDERING_QUASI_DEFAULT = 201 +curandOrdering_t = c_int + +# enum curandDirectionVectorSet +## Specific set of 32-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions +CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 +## Specific set of 32-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, +## and scrambled +CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 +## Specific set of 64-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions +CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 +## Specific set of 64-bit direction vectors generated from polynomials +## recommended by S. Joe and F. Y. 
Kuo, for up to 20,000 dimensions, +## and scrambled +CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 +curandDirectionVectorSet_t = c_int + +# enum curandMethod +CURAND_CHOOSE_BEST = 0 +CURAND_ITR = 1 +CURAND_KNUTH = 2 +CURAND_HITR = 3 +CURAND_M1 = 4 +CURAND_M2 = 5 +CURAND_BINARY_SEARCH = 6 +CURAND_DISCRETE_GAUSS = 7 +CURAND_REJECTION = 8 +CURAND_DEVICE_API = 9 +CURAND_FAST_REJECTION = 10 +CURAND_3RD = 11 +CURAND_DEFINITION = 12 +CURAND_POISSON = 13 +curandMethod_t = c_int + +curandGenerator_t = c_void_p +p_curandGenerator_t = POINTER(curandGenerator_t) + + +class CuRandError(Exception): + def __init__(self, code): + super(CuRandError, self).__init__(STATUS[code]) + + +class libcurand(Lib): + lib = 'curand' + ErrorType = CuRandError + + @property + def version(self): + ver = c_int(0) + self.curandGetVersion(byref(ver)) + return ver.value + + curandGetVersion = ctype_function(curandStatus_t, POINTER(c_int)) + + curandCreateGenerator = ctype_function( + curandStatus_t, + p_curandGenerator_t, # generator reference + curandRngType_t) # rng_type + + curandDestroyGenerator = ctype_function( + curandStatus_t, + curandGenerator_t) + + curandSetStream = ctype_function(curandStatus_t, + curandGenerator_t, + cu_stream) + + curandSetGeneratorOffset = ctype_function(curandStatus_t, + curandGenerator_t, + c_ulonglong) + + curandSetPseudoRandomGeneratorSeed = ctype_function( + curandStatus_t, + curandGenerator_t, + c_ulonglong) + + curandSetQuasiRandomGeneratorDimensions = ctype_function( + curandStatus_t, + curandGenerator_t, + c_uint) + + curandGenerate = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_uint), + c_size_t) + + curandGenerateLongLong = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_ulonglong), + c_size_t) + + curandGenerateUniform = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t) + + curandGenerateUniformDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t) + + curandGenerateNormal = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t, + c_float, + c_float) + + curandGenerateNormalDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t, + c_double, + c_double) + + curandGenerateLogNormal = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_float), + c_size_t, + c_float, + c_float) + + curandGenerateLogNormalDouble = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_double), + c_size_t, + c_double, + c_double) + + curandGeneratePoisson = ctype_function(curandStatus_t, + curandGenerator_t, + POINTER(c_uint), + c_size_t, + c_double) + + +class Generator(finalizer.OwnerMixin): + def __init__(self, rng_type=CURAND_RNG_TEST): + self._api = libcurand() + self._handle = curandGenerator_t(0) + self._api.curandCreateGenerator(byref(self._handle), rng_type) + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.curandDestroyGenerator(handle) + + def set_stream(self, stream): + return self._api.curandSetStream(self._handle, stream.handle) + + def set_offset(self, offset): + return self._api.curandSetGeneratorOffset(self._handle, offset) + + def set_pseudo_random_generator_seed(self, seed): + return self._api.curandSetPseudoRandomGeneratorSeed(self._handle, seed) + + def set_quasi_random_generator_dimensions(self, num_dim): + return self._api.curandSetQuasiRandomGeneratorDimensions(self._handle, + num_dim) + + def generate(self, 
devout, num): + fn, ptr = self.__uint32_or_uint64(devout, + self._api.curandGenerate, + self._api.curandGenerateLongLong) + return fn(self._handle, ptr, num) + + def generate_uniform(self, devout, num): + ''' + devout --- device array for the output + num --- # of float to generate + ''' + fn, ptr = self.__float_or_double(devout, + self._api.curandGenerateUniform, + self._api.curandGenerateUniformDouble) + return fn(self._handle, ptr, num) + + def generate_normal(self, devout, num, mean, stddev): + fn, ptr = self.__float_or_double(devout, + self._api.curandGenerateNormal, + self._api.curandGenerateNormalDouble) + return fn(self._handle, ptr, num, mean, stddev) + + def generate_log_normal(self, devout, num, mean, stddev): + fn, ptr = self.__float_or_double( + devout, + self._api.curandGenerateLogNormal, + self._api.curandGenerateLogNormalDouble) + return fn(self._handle, ptr, num, mean, stddev) + + def generate_poisson(self, devout, num, lmbd): + if devout.dtype not in (np.dtype(np.uint32), np.dtype(np.int32)): + raise ValueError("Only accept int32 or uint32 arrays") + dptr = device_pointer(devout) + ptr = cast(c_void_p(dptr), POINTER(c_uint)) + return self._api.curandGeneratePoisson(self._handle, ptr, num, lmbd) + + def __float_or_double(self, devary, floatfn, doublefn): + if devary.dtype == np.float32: + fn = floatfn + fty = c_float + elif devary.dtype == np.float64: + fn = doublefn + fty = c_double + else: + raise ValueError("Only accept float or double arrays.") + dptr = device_pointer(devary) + ptr = cast(c_void_p(dptr), POINTER(fty)) + return fn, ptr + + def __uint32_or_uint64(self, devary, fn32, fn64): + if devary.dtype in (np.dtype(np.uint32), np.dtype(np.int32)): + fn = self._api.curandGenerate + ity = c_uint + elif devary.dtype in (np.dtype(np.uint64), np.dtype(np.int64)): + fn = self._api.curandGenerateLongLong + ity = c_ulonglong + else: + raise ValueError("Only accept int32, int64, " + "uint32 or uint64 arrays") + dptr = device_pointer(devary) + ptr = cast(c_void_p(dptr), POINTER(ity)) + return fn, ptr diff --git a/pyculib/runtests.py b/pyculib/runtests.py new file mode 100644 index 0000000..d583710 --- /dev/null +++ b/pyculib/runtests.py @@ -0,0 +1,20 @@ +import sys +from os.path import dirname +from numba.testing import run_tests + +def _main(argv, **kwds): + # This helper function assumes the first element of argv + # is the name of the calling program. + # The 'main' API function is invoked in-process, and thus + # will synthesize that name. + return run_tests(argv, defaultTest='pyculib', + topleveldir=dirname(dirname(__file__)), **kwds).wasSuccessful() + +def main(*argv, **kwds): + """keyword arguments are accepted for backward compatiblity only. + See `numba.testing.run_tests()` documentation for details.""" + + return _main(['
'] + list(argv), **kwds) + +if __name__ == '__main__': + sys.exit(0 if _main(sys.argv) else 1) diff --git a/pyculib/sorting/__init__.py b/pyculib/sorting/__init__.py new file mode 100644 index 0000000..c735d15 --- /dev/null +++ b/pyculib/sorting/__init__.py @@ -0,0 +1,2 @@ +from .radixsort import RadixSort +from .segsort import segmented_sort diff --git a/pyculib/sorting/common.py b/pyculib/sorting/common.py new file mode 100644 index 0000000..f6fb046 --- /dev/null +++ b/pyculib/sorting/common.py @@ -0,0 +1,26 @@ +from __future__ import print_function, absolute_import, division +from numba import findlib +import ctypes +import os +import platform +import warnings + +def library_extension(): + p = platform.system() + if p == 'Linux': + return 'so' + if p == 'Windows': + return 'dll' + if p == 'Darwin': + return 'dylib' + +def load_lib(libname): + fullname = 'pyculib_%s.%s' % (libname, library_extension()) + devlib = os.path.join(os.path.abspath(os.path.dirname(__file__)), fullname) + if os.path.exists(devlib): + libpath = devlib + warnings.warn('Using in-tree library %s' % libpath) + else: + libpath = os.path.join(findlib.get_lib_dir(), fullname) + + return ctypes.CDLL(libpath) diff --git a/pyculib/sorting/radixsort.py b/pyculib/sorting/radixsort.py new file mode 100644 index 0000000..cbcf787 --- /dev/null +++ b/pyculib/sorting/radixsort.py @@ -0,0 +1,275 @@ +""" +Uses radixsort implementation from CUB which has the following license: + +Copyright (c) 2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +from __future__ import print_function, absolute_import, division +import ctypes +from .common import load_lib +from contextlib import contextmanager +from numba.cuda.cudadrv.driver import device_pointer +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.devicearray import auto_device, is_cuda_ndarray +from numba import cuda +import numpy as np + +lib = load_lib('radixsort') + +_argtypes = [ + ctypes.c_void_p, # temp + ctypes.c_uint, # count + ctypes.c_void_p, # d_key + ctypes.c_void_p, # d_key_alt + ctypes.c_void_p, # d_vals + ctypes.c_void_p, # d_vals_alt + cu_stream, + ctypes.c_int, # descending + ctypes.c_uint, # begin_bit + ctypes.c_uint, # end_bit +] + +_support_types = { + np.float32: 'float', + np.float64: 'double', + np.int32: 'int32', + np.uint32: 'uint32', + np.int64: 'int64', + np.uint64: 'uint64' +} + +_overloads = {} + + +def _init(): + for ty, name in _support_types.items(): + dtype = np.dtype(ty) + fn = getattr(lib, "radixsort_{0}".format(name)) + _overloads[dtype] = fn + fn.argtypes = _argtypes + fn.restype = ctypes.c_void_p + + +_init() + +lib.radixsort_cleanup.argtypes = [ctypes.c_void_p] + + +def _devptr(p): + if p is None: + return None + else: + return device_pointer(p) + + +@contextmanager +def _autodevice(ary, stream, firstk=None): + if ary is not None: + dptr, conv = auto_device(ary, stream=stream) + yield dptr + if conv: + if firstk is None: + dptr.copy_to_host(ary, stream=stream) + else: + dptr.bind(stream)[:firstk].copy_to_host(ary[:firstk], + stream=stream) + else: + yield None + + +@cuda.jit +def _cu_arange(ary, count): + i = cuda.grid(1) + if i < count: + ary[i] = i + + +class RadixSort(object): + """Provides radix sort and radix select. + + The algorithm implemented here is best for large arrays (``N > 1e6``) due to + the latency introduced by its use of multiple kernel launches. It is + recommended to use ``segmented_sort`` instead for batches of smaller arrays. + + :type maxcount: int + :param maxcount: Maximum number of items to sort + :type dtype: numpy.dtype + :param dtype: The element type to sort + :type descending: bool + :param descending: Sort in descending order? + :param stream: The CUDA stream to run the kernels in + """ + + def __init__(self, maxcount, dtype, descending=False, stream=0): + self.maxcount = int(maxcount) + self.dtype = np.dtype(dtype) + self._arysize = int(self.maxcount * self.dtype.itemsize) + self.descending = descending + self.stream = stream + self._sort = _overloads[self.dtype] + self._cleanup = lib.radixsort_cleanup + + ctx = cuda.current_context() + self._temp_keys = ctx.memalloc(self._arysize) + self._temp_vals = ctx.memalloc(self._arysize) + self._temp = self._call(temp=None, keys=None, vals=None) + + def __del__(self): + try: + self.close() + except: + pass + + def close(self): + """Explicitly release internal resources + + Called automatically when the object is deleted. 
+ """ + if self._temp is not None: + self._cleanup(self._temp) + self._temp = None + + def _call(self, temp, keys, vals, begin_bit=0, end_bit=None): + stream = self.stream.handle if self.stream else self.stream + begin_bit = begin_bit + end_bit = end_bit or self.dtype.itemsize * 8 + descending = int(self.descending) + + count = self.maxcount + if keys: + count = keys.size + + return self._sort( + temp, + ctypes.c_uint(count), + _devptr(keys), + _devptr(self._temp_keys), + _devptr(vals), + _devptr(self._temp_vals), + stream, + descending, + begin_bit, + end_bit + ) + + def _sentry(self, ary): + if ary.dtype != self.dtype: + raise TypeError("dtype mismatch") + if ary.size > self.maxcount: + raise ValueError("keys array too long") + + def sort(self, keys, vals=None, begin_bit=0, end_bit=None): + """ + Perform a inplace sort on ``keys``. Memory transfer is performed + automatically. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type vals: numpy.ndarray + :param vals: Optional. Additional values to be reordered along the sort. + It is modified in place. Only the ``uint32`` dtype is + supported in this version. + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + """ + self._sentry(keys) + with _autodevice(keys, self.stream) as d_keys: + with _autodevice(vals, self.stream) as d_vals: + self._call(self._temp, keys=d_keys, vals=d_vals, + begin_bit=begin_bit, end_bit=end_bit) + + def select(self, k, keys, vals=None, begin_bit=0, end_bit=None): + """Perform a inplace k-select on ``keys``. + + Memory transfer is performed automatically. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type vals: numpy.ndarray + :param vals: Optional. Additional values to be reordered along the sort. + It is modified in place. Only the ``uint32`` dtype is + supported in this version. + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + """ + self._sentry(keys) + with _autodevice(keys, self.stream, firstk=k) as d_keys: + with _autodevice(vals, self.stream, firstk=k) as d_vals: + self._call(self._temp, keys=d_keys, vals=d_vals, + begin_bit=begin_bit, end_bit=end_bit) + + def init_arg(self, size): + """Initialize an empty CUDA ndarray of uint32 with ascending integers + starting from zero + + :type size: int + :param size: Number of elements for the output array + :return: An array with values ``[0, 1, 2, ...m size - 1 ]`` + """ + d_vals = cuda.device_array(size, dtype=np.uint32, stream=self.stream) + _cu_arange.forall(d_vals.size, stream=self.stream)(d_vals, size) + return d_vals + + def argselect(self, k, keys, begin_bit=0, end_bit=None): + """Similar to ``RadixSort.select`` but returns the new sorted indices. + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + :return: The indices indicating the new ordering as an array on the CUDA + device or on the host. + """ + d_vals = self.init_arg(keys.size) + self.select(k, keys, vals=d_vals, begin_bit=begin_bit, end_bit=end_bit) + res = d_vals.bind(self.stream)[:k] + if not is_cuda_ndarray(keys): + res = res.copy_to_host(stream=self.stream) + return res + + def argsort(self, keys, begin_bit=0, end_bit=None): + """Similar to ``RadixSort.sort`` but returns the new sorted indices. 
+ + :type keys: numpy.ndarray + :param keys: Keys to sort inplace + :type begin_bit: int + :param begin_bit: The first bit to sort + :type end_bit: int + :param end_bit: Optional. The last bit to sort + :return: The indices indicating the new ordering as an array on the CUDA + device or on the host. + """ + d_vals = self.init_arg(keys.size) + self.sort(keys, vals=d_vals, begin_bit=begin_bit, end_bit=end_bit) + res = d_vals + if not is_cuda_ndarray(keys): + res = res.copy_to_host(stream=self.stream) + return res + diff --git a/pyculib/sorting/segsort.py b/pyculib/sorting/segsort.py new file mode 100644 index 0000000..36f1a98 --- /dev/null +++ b/pyculib/sorting/segsort.py @@ -0,0 +1,116 @@ +""" +Uses segmented sort implementation from ModernGPU which has the following +license: + +Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +from __future__ import print_function, absolute_import, division +import ctypes +from .common import load_lib +from contextlib import contextmanager +from numba.cuda.cudadrv.driver import device_pointer +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.devicearray import auto_device +import numpy as np + +lib = load_lib('segsort') + +_argtypes = [ + # d_key + ctypes.c_void_p, + # d_vals + ctypes.c_void_p, + # N + ctypes.c_uint, + # segments + ctypes.c_void_p, + # Nseg + ctypes.c_uint, + # stream + cu_stream, +] + +_support_types = { + np.float32: 'float32', + np.float64: 'float64', + np.int32: 'int32', + np.uint32: 'uint32', + np.int64: 'int64', + np.uint64: 'uint64' +} + +_overloads = {} + + +def _init(): + for k, v in _support_types.items(): + fn = getattr(lib, 'segsortpairs_{0}'.format(v)) + fn.argtypes = _argtypes + _overloads[np.dtype(k)] = fn + + +_init() + + +@contextmanager +def _autodevice(ary, stream): + if ary is not None: + dptr, conv = auto_device(ary, stream=stream) + yield dptr + if conv: + dptr.copy_to_host(ary, stream=stream) + else: + yield None + + +def _segmentedsort(d_keys, d_vals, d_segments, stream): + _overloads[d_keys.dtype](device_pointer(d_keys), + device_pointer(d_vals), + d_keys.size, + device_pointer(d_segments), + d_segments.size, + stream.handle if stream else 0) + + +def segmented_sort(keys, vals, segments, stream=0): + """Performs an inplace sort on small segments (N < 1e6). + + :type keys: numpy.ndarray + :param keys: Keys to sort inplace. + :type vals: numpy.ndarray + :param vals: Values to be reordered inplace along the sort. Only the + ``uint32`` dtype is supported in this implementation. + :type segments: numpy.ndarray + :param segments: Segment separation location. e.g. ``array([3, 6, 8])`` for + segments of ``keys[:3]``, ``keys[3:6]``, ``keys[6:8]``, + ``keys[8:]``. + :param stream: Optional. A cuda stream in which the kernels are executed. 
+ """ + with _autodevice(keys, stream) as d_keys: + with _autodevice(vals, stream) as d_vals: + d_segments, _ = auto_device(segments, stream=stream) + _segmentedsort(d_keys, d_vals, d_segments, stream) + diff --git a/pyculib/sparse/__init__.py b/pyculib/sparse/__init__.py new file mode 100644 index 0000000..a84a6f5 --- /dev/null +++ b/pyculib/sparse/__init__.py @@ -0,0 +1,2 @@ +from __future__ import print_function, absolute_import +from .api import * diff --git a/pyculib/sparse/api.py b/pyculib/sparse/api.py new file mode 100644 index 0000000..68b1e1a --- /dev/null +++ b/pyculib/sparse/api.py @@ -0,0 +1,762 @@ +from __future__ import print_function, absolute_import, division +from contextlib import contextmanager +import numpy as np +import scipy.sparse as ss +from numba import cuda +from .binding import (cuSparse, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_INDEX_BASE_ONE) + +dtype_to_char = { + np.dtype(np.float32): 'S', + np.dtype(np.float64): 'D', + np.dtype(np.complex64): 'C', + np.dtype(np.complex128): 'Z', +} + + +def _sentry_ndim(ndim, **kws): + for k, a in kws.items(): + if a.ndim != ndim: + raise ValueError("%s.ndim must be %dD" % (k, ndim)) + + +def _sentry_dtype(dtype, **kws): + for k, a in kws.items(): + if a.dtype != dtype: + raise TypeError("%s.dtype is not %s" % (k, dtype)) + + +@contextmanager +def _readonly(*arys): + ds = [] + for a in arys: + dmem, _ = cuda._auto_device(a) + ds.append(dmem) + yield ds + + +@contextmanager +def _readwrite(*arys): + ds = [] + ws = [] + for a in arys: + dmem, conv = cuda._auto_device(a) + ds.append(dmem) + if conv: + ws.append((a, dmem)) + yield ds + for a, d in ws: + d.copy_to_host(a) + + +class Sparse(object): + """All cuSPARSE functions are available under the Sparse object. + + :param idxbase: The base for indexing, either 0 or 1. Optional, defaults + to 0. + """ + + @cuda.require_context + def __init__(self, idxbase=0): + """ + Args + ---- + - idxbase int + Index base. 
Must be 0 or 1 + """ + if idxbase not in (0, 1): + raise ValueError("Invalid index base") + + self.api = cuSparse() + self.idxbase = (CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_INDEX_BASE_ONE)[idxbase] + + @property + def stream(self): + return self.api.stream + + @stream.setter + def stream(self, stream): + self.api.stream = stream + + def _get_api(self, fname, dtype): + ch = dtype_to_char[np.dtype(dtype)] + fn = "%s%s" % (ch, fname) + return getattr(self.api, fn) + + def matdescr(self, indexbase=None, diagtype='N', fillmode='L', + matrixtype='G'): + descr = self.api.matdescr() + descr.indexbase = self.idxbase if indexbase is None else indexbase + descr.diagtype = diagtype + descr.fillmode = fillmode + descr.matrixtype = matrixtype + return descr + + # ------------------------------------------------------------------------ + # Level 1 API + + def axpyi(self, alpha, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-axpyi + """ + _sentry_ndim(1, xVal=xVal, xInd=xVal, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("axpyi", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + fn(nnz=nnz, alpha=alpha, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return y + + def doti(self, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-doti + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("doti", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + result = fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return result + + def dotci(self, xVal, xInd, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dotci + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("dotci", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + result = fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, + idxBase=self.idxbase) + return result + + def gthr(self, y, xVal, xInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gthr + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("gthr", xVal.dtype) + nnz = xVal.size + with _readonly(y, xInd) as [dy, dxind]: + with _readwrite(xVal) as [dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + def gthrz(self, y, xVal, xInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gthrz + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("gthrz", xVal.dtype) + nnz = xVal.size + with _readonly(xInd) as [dxind]: + with _readwrite(y, xVal) as [dy, dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + def roti(self, xVal, xInd, y, c, s): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-roti + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("roti", xVal.dtype) + nnz = xVal.size + with _readonly(xInd) as [dxind]: + with _readwrite(y, xVal) as [dy, dxval]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, c=c, s=s, + idxBase=self.idxbase) + + def sctr(self, xVal, xInd, y): + 
"""http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-sctr + """ + _sentry_ndim(1, xVal=xVal, xInd=xInd, y=y) + _sentry_dtype(np.int32, xInd=xInd) + _sentry_dtype(xVal.dtype, y=y) + fn = self._get_api("sctr", xVal.dtype) + nnz = xVal.size + with _readonly(xVal, xInd) as [dxval, dxind]: + with _readwrite(y) as [dy]: + fn(nnz=nnz, xVal=dxval, xInd=dxind, y=dy, idxBase=self.idxbase) + + # ------------------------------------------------------------------------ + # Level 2 API + + def bsrmv_matrix(self, dir, trans, alpha, descr, bsrmat, x, beta, y): + bsrVal = bsrmat.data + bsrRowPtr = bsrmat.indptr + bsrColInd = bsrmat.indices + nnzb = bsrColInd.size + m, n = bsrmat.shape + blockDim, blockDim1 = bsrmat.blocksize + assert blockDim == blockDim1 + + mb = (m + blockDim - 1) // blockDim + nb = (n + blockDim - 1) // blockDim + + self.bsrmv(dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, + bsrRowPtr, bsrColInd, blockDim, x, beta, y) + + def bsrmv(self, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, + bsrRowPtr, bsrColInd, blockDim, x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsrmv + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(bsrVal.dtype, x=x, y=y) + fn = self._get_api("bsrmv", bsrVal.dtype) + + with _readonly(bsrVal, bsrRowPtr, bsrColInd, x) \ + as [dbsrVal, dbsrRowPtr, dbsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(dirA=dir, transA=trans, mb=mb, nb=nb, nnzb=nnzb, + alpha=alpha, descrA=descr, bsrValA=dbsrVal, + bsrRowPtrA=dbsrRowPtr, bsrColIndA=dbsrColInd, + blockDim=blockDim, x=dx, beta=beta, y=dy) + + def bsrxmv(self, dir, trans, sizeOfMask, mb, nb, nnzb, alpha, descr, + bsrVal, bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, blockDim, + x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsrxmv + """ + + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(bsrVal.dtype, x=x, y=y) + fn = self._get_api("bsrxmv", bsrVal.dtype) + + with _readonly(bsrVal, bsrRowPtr, bsrColInd, bsrMaskPtr, bsrEndPtr, x) \ + as [dbsrVal, dbsrRowPtr, dbsrColInd, dbsrMaskPtr, dbsrEndPtr, dx]: + with _readwrite(y) as [dy]: + fn(dirA=dir, transA=trans, sizeOfMask=sizeOfMask, + mb=mb, nb=nb, nnzb=nnzb, alpha=alpha, descrA=descr, + bsrValA=dbsrVal, bsrRowPtrA=dbsrRowPtr, + bsrColIndA=dbsrColInd, bsrMaskPtrA=dbsrMaskPtr, + bsrEndPtrA=dbsrEndPtr, blockDim=blockDim, x=dx, beta=beta, + y=dy) + + def csrmv(self, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, + csrColInd, x, beta, y): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmv + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(csrVal.dtype, x=x, y=y) + fn = self._get_api("csrmv", csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd, x) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(transA=trans, m=m, n=n, nnz=nnz, + alpha=alpha, descrA=descr, csrValA=dcsrVal, + csrRowPtrA=dcsrRowPtr, csrColIndA=dcsrColInd, x=dx, + beta=beta, y=dy) + + def csrsv_analysis(self, trans, m, nnz, descr, csrVal, csrRowPtr, + csrColInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsvanalysis + + Returns + ------- + SolveAnalysisInfo + """ + fn = self._get_api("csrsv_analysis", csrVal.dtype) + info = self.api.solve_analysis_info() + with _readonly(csrVal, csrRowPtr, csrColInd) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd]: + fn(transA=trans, m=m, nnz=nnz, descrA=descr, csrValA=dcsrVal, + csrRowPtrA=dcsrRowPtr, csrColIndA=dcsrColInd, info=info) + return info + + def csrsv_solve(self, trans, m, alpha, descr, csrVal, csrRowPtr, + csrColInd, info, x, y): + 
"""http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsvsolve + """ + _sentry_ndim(1, x=x, y=y) + _sentry_dtype(csrVal.dtype, x=x, y=y) + fn = self._get_api("csrsv_solve", csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd, x) \ + as [dcsrVal, dcsrRowPtr, dcsrColInd, dx]: + with _readwrite(y) as [dy]: + fn(transA=trans, m=m, alpha=alpha, descrA=descr, + csrValA=dcsrVal, csrRowPtrA=dcsrRowPtr, + csrColIndA=dcsrColInd, info=info, x=dx, y=dy) + + hybmv = NotImplemented + hybmv_analysis = NotImplemented + hybmv_solve = NotImplemented + + # ------------------------------------------------------------------------ + # Level 3 API + + def csrmm(self, transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmm + """ + _sentry_dtype(csrValA.dtype, B=B, C=C) + fn = self._get_api("csrmm", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, B) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dB]: + with _readwrite(C) as [dC]: + fn(transA=transA, m=m, n=n, k=k, nnz=nnz, alpha=alpha, + descrA=descrA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, B=dB, ldb=ldb, beta=beta, C=dC, + ldc=ldc) + + def csrmm2(self, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrmm2 + """ + _sentry_dtype(csrValA.dtype, B=B, C=C) + fn = self._get_api("csrmm2", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, B) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dB]: + with _readwrite(C) as [dC]: + fn(transa=transA, transb=transB, m=m, n=n, k=k, nnz=nnz, + alpha=alpha, + descrA=descrA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, B=dB, ldb=ldb, beta=beta, C=dC, + ldc=ldc) + + def csrsm_analysis(self, transA, m, nnz, descrA, csrValA, csrRowPtrA, + csrColIndA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsmanalysis + """ + fn = self._get_api("csrsm_analysis", csrValA.dtype) + info = self.api.solve_analysis_info() + with _readonly(csrValA, csrRowPtrA, csrColIndA) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA]: + fn(transA=transA, m=m, nnz=nnz, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, info=info) + return info + + def csrsm_solve(self, transA, m, n, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, info, X, ldx, Y, ldy): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrsmsolve + """ + fn = self._get_api("csrsm_solve", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, X) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dX]: + with _readwrite(Y) as [dY]: + fn(transA=transA, m=m, n=n, alpha=alpha, descrA=descrA, + csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info, x=dX, ldx=ldx, y=dY, + ldy=ldy) + + # ------------------------------------------------------------------------ + # Extra API + + def XcsrgeamNnz(self, m, n, descrA, nnzA, csrRowPtrA, csrColIndA, descrB, + nnzB, csrRowPtrB, csrColIndB, descrC, csrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgeam + + Returns + ------- + int + nnzC + """ + fn = self.api.XcsrgeamNnz + with _readonly(csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB) \ + as (dcsrRowPtrA, dcsrColIndA, dcsrRowPtrB, dcsrColIndB): + with _readwrite(csrRowPtrC) as [dcsrRowPtrC]: + nnzC = fn(m=m, n=n, descrA=descrA, nnzA=nnzA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, descrB=descrB, 
nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrRowPtrC=dcsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnzC + + def csrgeam(self, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, beta, descrB, nnzB, csrValB, csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgeam + """ + fn = self._get_api("csrgeam", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB, csrRowPtrC) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dcsrValB, dcsrRowPtrB, + dcsrColIndB, dcsrRowPtrC]: + with _readwrite(csrValC, csrColIndC) as [dcsrValC, dcsrColIndC]: + fn(m=m, n=n, alpha=alpha, descrA=descrA, nnzA=nnzA, + csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, csrValB=dcsrValB, + descrB=descrB, nnzB=nnzB, beta=beta, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + + def XcsrgemmNnz(self, transA, transB, m, n, k, descrA, nnzA, csrRowPtrA, + csrColIndA, descrB, nnzB, csrRowPtrB, csrColIndB, descrC, + csrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm + + Returns + ------- + int + nnzC + """ + fn = self.api.XcsrgemmNnz + with _readonly(csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB) \ + as (dcsrRowPtrA, dcsrColIndA, dcsrRowPtrB, dcsrColIndB): + with _readwrite(csrRowPtrC) as [dcsrRowPtrC]: + nnzC = fn(transA=transA, transB=transB, k=k, m=m, n=n, + descrA=descrA, nnzA=nnzA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, descrB=descrB, nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrRowPtrC=dcsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnzC + + def csrgemm(self, transA, transB, m, n, k, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm + """ + fn = self._get_api("csrgemm", csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB, csrRowPtrC) \ + as [dcsrValA, dcsrRowPtrA, dcsrColIndA, dcsrValB, dcsrRowPtrB, + dcsrColIndB, dcsrRowPtrC]: + with _readwrite(csrValC, csrColIndC) as [dcsrValC, dcsrColIndC]: + fn(transA=transA, transB=transB, m=m, n=n, k=k, descrA=descrA, + nnzA=nnzA, csrValA=dcsrValA, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, csrValB=dcsrValB, + descrB=descrB, nnzB=nnzB, + csrRowPtrB=dcsrRowPtrB, csrColIndB=dcsrColIndB, + descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + def csrgemm_ez(self, matA, matB, transA='N', transB='N', descrA=None, + descrB=None, descrC=None): + """ + Raises ValueError if the result is entirely zero. + + Returns + ------- + CudaSparseMatrix + a csr matrix of the matrix product (matA * matB). 
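+
+        A hedged usage sketch (illustrative only; assumes a CUDA device and
+        that ``matA``/``matB`` are ``CudaCSRMatrix`` objects such as those
+        returned by :func:`csr_matrix` defined later in this module)::
+
+            import numpy as np
+            from pyculib.sparse import Sparse, csr_matrix
+
+            sp = Sparse()
+            A = csr_matrix(np.array([[1., 0.], [0., 2.]], dtype=np.float32))
+            B = csr_matrix(np.array([[0., 3.], [4., 0.]], dtype=np.float32))
+            C = sp.csrgemm_ez(A, B)             # CudaCSRMatrix holding A * B
+            dense = C.copy_to_host().toarray()  # [[0., 3.], [8., 0.]]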
+ + Notes + ----- + Calls XcsrgemmNnz and csrgemm + """ + tmpdescr = self.matdescr() + descrA = descrA or tmpdescr + descrB = descrB or tmpdescr + descrC = descrC or tmpdescr + + dtype = matA.dtype + m, ka = matA.shape + kb, n = matB.shape + if ka != kb: + raise ValueError("incompatible matrices") + k = ka + + indptrC = cuda.device_array(m + 1, dtype='int32') + nnz = self.XcsrgemmNnz(transA, transB, m, n, k, descrA, matA.nnz, + matA.indptr, matA.indices, descrB, matB.nnz, + matB.indptr, matB.indices, descrC, indptrC) + + if nnz == 0: + raise ValueError("result is entirely zero") + + dataC = cuda.device_array(nnz, dtype=dtype) + indicesC = cuda.device_array(nnz, dtype='int32') + self.csrgemm(transA, transB, m, n, k, descrA, matA.nnz, matA.data, + matA.indptr, matA.indices, descrB, matB.nnz, matB.data, + matB.indptr, matB.indices, descrC, dataC, indptrC, + indicesC) + + return CudaCSRMatrix().from_attributes(data=dataC, indices=indicesC, + indptr=indptrC, shape=(m, n), + dtype=dtype, nnz=nnz) + + # ------------------------------------------------------------------------ + # Preconditioners + + def csric0(self, trans, m, descr, csrValM, csrRowPtrA, csrColIndA, info): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csric0 + """ + fn = self._get_api("csric0", csrValM.dtype) + with _readonly(csrRowPtrA, csrColIndA) as [dcsrRowPtrA, dcsrColIndA]: + with _readwrite(csrValM) as [dcsrValM]: + fn(trans=trans, m=m, descrA=descr, + csrValA_ValM=dcsrValM, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info) + + def csrilu0(self, trans, m, descr, csrValM, csrRowPtrA, csrColIndA, info): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrilu0 + """ + fn = self._get_api("csrilu0", csrValM.dtype) + with _readonly(csrRowPtrA, csrColIndA) as [dcsrRowPtrA, dcsrColIndA]: + with _readwrite(csrValM) as [dcsrValM]: + fn(trans=trans, m=m, descrA=descr, + csrValA_ValM=dcsrValM, csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, info=info) + + def gtsv(self, m, n, dl, d, du, B, ldb): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsv + """ + fn = self._get_api("gtsv", B.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(B) as [dB]: + fn(m=m, n=n, dl=ddl, d=dd, du=ddu, B=dB, ldb=ldb) + + def gtsv_nopivot(self, m, n, dl, d, du, B, ldb): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsv_nopivot + """ + fn = self._get_api("gtsv_nopivot", B.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(B) as [dB]: + fn(m=m, n=n, dl=ddl, d=dd, du=ddu, B=dB, ldb=ldb) + + def gtsvStridedBatch(self, m, dl, d, du, x, batchCount, batchStride): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-gtsvstridedbatch + """ + fn = self._get_api("gtsvStridedBatch", x.dtype) + with _readonly(dl, d, du) as [ddl, dd, ddu]: + with _readwrite(x) as [dx]: + fn(m=m, dl=ddl, d=dd, du=ddu, x=dx, + batchCount=batchCount, batchStride=batchStride) + + # ------------------------------------------------------------------------ + # Format Conversion + + def bsr2csr(self, dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, bsrColIndA, + blockDim, descrC, csrValC, csrRowPtrC, csrColIndC): + fn = self._get_api('bsr2csr', bsrValA.dtype) + with _readonly(bsrValA, bsrRowPtrA, bsrColIndA) as [dbsrValA, + dbsrRowPtrA, + dbsrColIndA]: + with _readwrite(csrValC, csrRowPtrC, csrColIndC) as [dcsrValC, + dcsrRowPtrC, + dcsrColIndC]: + fn(dirA=dirA, mb=mb, nb=nb, descrA=descrA, bsrValA=dbsrValA, + bsrRowPtrA=dbsrRowPtrA, bsrColIndA=dbsrColIndA, + blockDim=blockDim, 
descrC=descrC, csrValC=dcsrValC, + csrRowPtrC=dcsrRowPtrC, csrColIndC=dcsrColIndC) + + def Xcoo2csr(self, cooRowInd, nnz, m, csrRowPtr): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-bsr2csr + """ + fn = self.api.Xcoo2csr + with _readonly(cooRowInd) as [dcooRowInd]: + with _readwrite(csrRowPtr) as [dcsrRowPtr]: + fn(cooRowInd=dcooRowInd, nnz=nnz, m=m, csrRowPtr=dcsrRowPtr, + idxBase=self.idxbase) + + def csc2dense(self, m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csc2dense + """ + fn = self._get_api('csc2dense', cscValA.dtype) + with _readonly(cscValA, cscRowIndA, cscColPtrA) as [dcscValA, + dcscRowIndA, + dcscColPtrA]: + with _readwrite(A) as [dA]: + fn(m=m, n=n, descrA=descrA, cscValA=dcscValA, + cscRowIndA=dcscRowIndA, cscColPtrA=dcscColPtrA, A=dA, + lda=lda) + + csc2hyb = NotImplemented + + def Xcsr2bsrNnz(self, dirA, m, n, descrA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrRowPtrC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2bsr + """ + fn = self.api.Xcsr2bsrNnz + with _readonly(csrRowPtrA, csrColIndA, bsrRowPtrC) as [dcsrRowPtrA, + dcsrColIndA, + dbsrRowPtrC]: + nnz = fn(dirA=dirA, m=m, n=n, descrA=descrA, + csrRowPtrA=dcsrRowPtrA, + csrColIndA=dcsrColIndA, + blockDim=blockDim, + descrC=descrC, bsrRowPtrC=dbsrRowPtrC, + nnzTotalDevHostPtr=0) + return nnz + + def csr2bsr(self, dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2bsr + """ + fn = self._get_api('csr2bsr', csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + with _readwrite(bsrValC, bsrRowPtrC, bsrColIndC) as [dbsrValC, + dbsrRowPtrC, + dbsrColIndC]: + nnz = fn(dirA=dirA, m=m, n=n, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, + blockDim=blockDim, descrC=descrC, bsrValC=dbsrValC, + bsrRowPtrC=dbsrRowPtrC, bsrColIndC=dbsrColIndC) + return nnz + + def Xcsr2coo(self, csrRowPtr, nnz, m, cooRowInd): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2coo + """ + fn = self.api.Xcsr2coo + with _readonly(csrRowPtr) as [dcsrRowPtr]: + with _readwrite(cooRowInd) as [dcooRowInd]: + fn(csrRowPtr=dcsrRowPtr, nnz=nnz, m=m, cooRowInd=dcooRowInd, + idxBase=self.idxbase) + + def csr2csc(self, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscRowInd, cscColPtr, copyValues): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2csc + """ + fn = self._get_api('csr2csc', csrVal.dtype) + with _readonly(csrVal, csrRowPtr, csrColInd) as [dcsrVal, dcsrRowPtr, + dcsrColInd]: + with _readwrite(cscVal, cscRowInd, cscColPtr) as [dcscVal, + dcscRowInd, + dcscColPtr]: + fn(m=m, n=n, nnz=nnz, csrVal=dcsrVal, csrRowPtr=dcsrRowPtr, + csrColInd=dcsrColInd, cscVal=dcscVal, cscRowInd=dcscRowInd, + cscColPtr=dcscColPtr, copyValues=copyValues, + idxBase=self.idxbase) + + def csr2dense(self, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csr2dense + """ + fn = self._get_api('csr2dense', csrValA.dtype) + with _readonly(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + with _readwrite(A) as [dA]: + fn(m=m, n=n, descrA=descrA, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA, A=dA, + lda=lda) + + csr2hyb = NotImplemented + + def dense2csc(self, m, n, descrA, A, lda, nnzPerCol, cscValA, cscRowIndA, + 
cscColPtrA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dense2csc + """ + fn = self._get_api('dense2csc', cscValA.dtype) + with _readonly(A, nnzPerCol) as [dA, nnzPerCol]: + with _readwrite(cscValA, cscRowIndA, cscColPtrA) as [dcscValA, + dcscRowIndA, + dcscColPtrA]: + fn(m=m, n=n, descrA=descrA, A=dA, lda=lda, + nnzPerCol=nnzPerCol, cscValA=dcscValA, + cscRowIndA=dcscRowIndA, + cscColPtrA=dcscColPtrA) + + def dense2csr(self, m, n, descrA, A, lda, nnzPerRow, csrValA, + csrRowPtrA, csrColIndA): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-dense2csr + + Returns + ------- + int + nnzTotalDevHostPtr + """ + fn = self._get_api('dense2csr', A.dtype) + with _readonly(A, nnzPerRow) as [dA, nnzPerRow]: + with _readwrite(csrValA, csrRowPtrA, csrColIndA) as [dcsrValA, + dcsrRowPtrA, + dcsrColIndA]: + fn(m=m, n=n, descrA=descrA, A=dA, lda=lda, + nnzPerRow=nnzPerRow, csrValA=dcsrValA, + csrRowPtrA=dcsrRowPtrA, csrColIndA=dcsrColIndA) + + dense2hyb = NotImplemented + hyb2csc = NotImplemented + hyb2csr = NotImplemented + hyb2dense = NotImplemented + + def nnz(self, dirA, m, n, descrA, A, lda, nnzPerRowCol): + """http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-nnz + """ + fn = self._get_api('nnz', A.dtype) + with _readonly(A) as [dA]: + with _readwrite(nnzPerRowCol) as [dnnzPerRowCol]: + nnzTotal = fn(dirA=dirA, m=m, n=n, descrA=descrA, A=dA, + nnzPerRowCol=dnnzPerRowCol, lda=lda, + nnzTotalDevHostPtr=0) + return nnzTotal + + +# ------------------------------------------------------------------------ +# Matrix Ctors + +class CudaSparseMatrix(object): + def from_host_matrix(self, matrix, stream=0): + dtype = matrix.dtype + shape = matrix.shape + nnz = matrix.nnz + data = cuda.to_device(matrix.data, stream=stream) + indices = cuda.to_device(matrix.indices, stream=stream) + indptr = cuda.to_device(matrix.indptr, stream=stream) + self.from_attributes(dtype=dtype, shape=shape, nnz=nnz, data=data, + indices=indices, indptr=indptr) + return self + + def from_attributes(self, dtype, shape, nnz, data, indices, indptr): + self.dtype = dtype + self.shape = shape + self.ndim = len(shape) + self.nnz = nnz + self.data = data + self.indices = indices + self.indptr = indptr + return self + + def copy_to_host(self, stream=0): + data = self.data.copy_to_host(stream=stream) + indices = self.indices.copy_to_host(stream=stream) + indptr = self.indptr.copy_to_host(stream=stream) + return self.host_constructor((data, indices, indptr), shape=self.shape) + + +class CudaBSRMatrix(CudaSparseMatrix): + host_constructor = ss.bsr_matrix + + def from_host_matrix(self, matrix, stream=0): + super(CudaBSRMatrix, self).from_host_matrix(matrix, stream=stream) + self.blocksize = matrix.blocksize + return self + + +class CudaCSCMatrix(CudaSparseMatrix): + host_constructor = ss.csc_matrix + + +class CudaCSRMatrix(CudaSparseMatrix): + host_constructor = ss.csr_matrix + + +def bsr_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.bsr_matrix``. + + Returns a BSR CUDA matrix. + """ + mat = ss.bsr_matrix(*args, **kws) + return CudaBSRMatrix().from_host_matrix(mat) + + +def csc_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.csc_matrix``. + + Returns a CSC CUDA matrix. + """ + mat = ss.csc_matrix(*args, **kws) + return CudaCSCMatrix().from_host_matrix(mat) + + +def csr_matrix(*args, **kws): + """Takes the same arguments as ``scipy.sparse.csr_matrix``. + + Returns a CSR CUDA matrix. 
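+
+    A minimal sketch (illustrative; assumes a CUDA-capable device and SciPy)::
+
+        import numpy as np
+        from pyculib.sparse import csr_matrix
+
+        dense = np.array([[1., 0.], [0., 2.]], dtype=np.float32)
+        mat = csr_matrix(dense)        # device-resident CSR copy of ``dense``
+        host = mat.copy_to_host()      # back to a scipy.sparse.csr_matrix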
+ """ + mat = ss.csr_matrix(*args, **kws) + return CudaCSRMatrix().from_host_matrix(mat) diff --git a/pyculib/sparse/binding.py b/pyculib/sparse/binding.py new file mode 100644 index 0000000..b1a44c5 --- /dev/null +++ b/pyculib/sparse/binding.py @@ -0,0 +1,1307 @@ +from __future__ import print_function, absolute_import, division + +from ctypes import c_float, c_double, byref, c_int, c_void_p + +from numba.cuda.cudadrv.drvapi import cu_stream +from numba.cuda.cudadrv.driver import device_pointer +from pyculib.utils import (Lib, ctype_function, finalizer, + c_complex, c_double_complex, memalign) + +from . import decls + +INV_STATUS = dict( + CUSPARSE_STATUS_SUCCESS=0, + CUSPARSE_STATUS_NOT_INITIALIZED=1, + CUSPARSE_STATUS_ALLOC_FAILED=2, + CUSPARSE_STATUS_INVALID_VALUE=3, + CUSPARSE_STATUS_ARCH_MISMATCH=4, + CUSPARSE_STATUS_MAPPING_ERROR=5, + CUSPARSE_STATUS_EXECUTION_FAILED=6, + CUSPARSE_STATUS_INTERNAL_ERROR=7, + CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED=8 +) + +STATUS = dict((v, k) for k, v in INV_STATUS.items()) + +CUSPARSE_INDEX_BASE_ZERO = 0 +CUSPARSE_INDEX_BASE_ONE = 1 + +CUSPARSE_DIAG_TYPE_NON_UNIT = 0 +CUSPARSE_DIAG_TYPE_UNIT = 1 + +CUSPARSE_FILL_MODE_LOWER = 0 +CUSPARSE_FILL_MODE_UPPER = 1 + +CUSPARSE_MATRIX_TYPE_GENERAL = 0 +CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 +CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 +CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 + +CUSPARSE_ACTION_SYMBOLIC = 0 +CUSPARSE_ACTION_NUMERIC = 1 + +CUSPARSE_POINTER_MODE_HOST = 0 +CUSPARSE_POINTER_MODE_DEVICE = 1 + +CUSPARSE_OPERATION_NON_TRANSPOSE = 0 +CUSPARSE_OPERATION_TRANSPOSE = 1 +CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 + +CUSPARSE_DIRECTION_ROW = 0 +CUSPARSE_DIRECTION_COLUMN = 1 + +# automatically decide how to split the +# data into regular/irregular part +CUSPARSE_HYB_PARTITION_AUTO = 0 +# store data into regular part up to a user +# specified treshhold +CUSPARSE_HYB_PARTITION_USER = 1 +# store all data in the regular part +CUSPARSE_HYB_PARTITION_MAX = 2 + +cusparseHandle_t = c_void_p +cusparseMatDescr_t = c_void_p +cusparseSolveAnalysisInfo_t = c_void_p +cusparseHybMat_t = c_void_p + +cusparseStatus_t = c_int +cusparseIndexBase_t = c_int +cusparsePointerMode_t = c_int +cusparseAction_t = c_int +cusparseFillMode_t = c_int +cusparseDiagType_t = c_int +cusparseOperation_t = c_int +cusparseDirection_t = c_int +cusparseHybPartition_t = c_int +cusparseMatrixType_t = c_int + +_c_types = { + 'int': c_int, + 'float': c_float, + 'double': c_double, + 'cuComplex': c_complex, + 'cuDoubleComplex': c_double_complex, + 'cudaStream_t': cu_stream, + 'cusparseStatus_t': cusparseStatus_t, + 'cusparseMatDescr_t': cusparseMatDescr_t, + 'cusparseSolveAnalysisInfo_t': cusparseSolveAnalysisInfo_t, + 'cusparseHybMat_t': cusparseHybMat_t, + 'cusparseHandle_t': cusparseHandle_t, + 'cusparsePointerMode_t': cusparsePointerMode_t, + 'cusparseAction_t': cusparseAction_t, + 'cusparseFillMode_t': cusparseFillMode_t, + 'cusparseDiagType_t': cusparseDiagType_t, + 'cusparseOperation_t': cusparseOperation_t, + 'cusparseDirection_t': cusparseDirection_t, + 'cusparseHybPartition_t': cusparseHybPartition_t, + 'cusparseIndexBase_t': cusparseIndexBase_t, + 'cusparseMatrixType_t': cusparseMatrixType_t, +} + + +class CuSparseError(Exception): + def __init__(self, code): + super(CuSparseError, self).__init__(STATUS[code]) + + +def _get_type(k): + try: + return _c_types[k] + except KeyError: + if k[-1] == '*': + return c_void_p + #return POINTER(_get_type(k[:-1])) + raise + + +def _init_ctype_function(name, decl): + res, args = decl + types = [_get_type(a) for _, a in 
args] + return ctype_function(_get_type(res), *types) + + +def _declarations(): + for k in dir(decls): + if not k.startswith('_'): + yield k, getattr(decls, k) + + +class _libcusparse(Lib): + lib = 'cusparse' + ErrorType = CuSparseError + + +def _init_libcusparse(): + gv = {} + for k, v in _declarations(): + gv[k] = _init_ctype_function(k, v) + base = _libcusparse + return type('libcusparse', (base,), gv) + + +libcusparse = _init_libcusparse() + +DIAGTYPE = (CUSPARSE_DIAG_TYPE_NON_UNIT, + CUSPARSE_DIAG_TYPE_UNIT) + +DIAGTYPECHAR = ('N', 'U') + +FILLMODE = (CUSPARSE_FILL_MODE_LOWER, + CUSPARSE_FILL_MODE_UPPER) + +FILLMODECHAR = ('L', 'U') + +MATRIXTYPE = (CUSPARSE_MATRIX_TYPE_GENERAL, + CUSPARSE_MATRIX_TYPE_SYMMETRIC, + CUSPARSE_MATRIX_TYPE_HERMITIAN, + CUSPARSE_MATRIX_TYPE_TRIANGULAR) + +MATRIXTYPECHAR = ('G', 'S', 'H', 'T') + + +class MatDescr(finalizer.OwnerMixin): + def __init__(self, api, handle): + self._api = api + self._handle = handle + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroyMatDescr(handle) + + @property + def diagtype(self): + return DIAGTYPECHAR[self._api.cusparseGetMatDiagType(self._handle)] + + @diagtype.setter + def diagtype(self, val): + self._api.cusparseSetMatDiagType(self._handle, + DIAGTYPECHAR.index(val)) + + @property + def fillmode(self): + return FILLMODECHAR[self._api.cusparseGetMatFillMode(self._handle)] + + @fillmode.setter + def fillmode(self, val): + return self._api.cusparseSetMatFillMode(self._handle, + FILLMODECHAR.index(val)) + + @property + def indexbase(self): + return self._api.cusparseGetMatIndexBase(self._handle) + + @indexbase.setter + def indexbase(self, val): + return self._api.cusparseSetMatIndexBase(self._handle, val) + + @property + def matrixtype(self): + return MATRIXTYPECHAR[self._api.cusparseGetMatType(self._handle)] + + @matrixtype.setter + def matrixtype(self, val): + return self._api.cusparseSetMatType(self._handle, + MATRIXTYPECHAR.index(val)) + + +class SolveAnalysisInfo(finalizer.OwnerMixin): + def __init__(self, api, handle): + self._api = api + self._handle = handle + self._finalizer_track((self._handle, self._api)) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroySolveAnalysisInfo(handle) + + +class _cuSparse(finalizer.OwnerMixin): + def __init__(self): + self._api = libcusparse() + self._handle = cusparseHandle_t() + try: + self._api.cusparseCreate(byref(self._handle)) + except CuSparseError: + raise RuntimeError("Cannot initialize cuSparse. 
" + "Could be caused by insufficient GPU memory.") + self._finalizer_track((self._handle, self._api)) + # Default to NULL stream + self._stream = 0 + # Default to host pointer + self.use_host_pointer() + + def matdescr(self): + handle = cusparseMatDescr_t() + self._api.cusparseCreateMatDescr(byref(handle)) + return MatDescr(self._api, handle) + + def solve_analysis_info(self): + handle = cusparseSolveAnalysisInfo_t() + self._api.cusparseCreateSolveAnalysisInfo(byref(handle)) + return SolveAnalysisInfo(self._api, handle) + + @classmethod + def _finalize(cls, res): + handle, api = res + api.cusparseDestroy(handle) + + @property + def version(self): + ver = c_int() + self._api.cusparseGetVersion(self._handle, byref(ver)) + return ver.value + + @property + def stream(self): + return self._stream + + @stream.setter + def stream(self, stream): + self._stream = stream + self._api.cusparseSetStream(self._handle, self._stream.handle) + + @property + def pointer_mode(self): + mode = cusparsePointerMode_t() + self._api.cusparseGetPointerMode(self._handle, byref(mode)) + return mode.value + + @pointer_mode.setter + def pointer_mode(self, value): + self._api.cusparseSetPointerMode(self._handle, value) + + def use_host_pointer(self): + self.pointer_mode = CUSPARSE_POINTER_MODE_HOST + + def use_device_pointer(self): + self.pointer_mode = CUSPARSE_POINTER_MODE_DEVICE + + +_strip_prefix = 'cusparse' +_len_strip_prefix = len(_strip_prefix) + + +def mangle(name): + assert name.startswith(_strip_prefix) + name = name[_len_strip_prefix:] + return name + + +def _flatten_args(args, kws, argnames, defaults): + values = list(args) + for name in argnames[len(args):]: + if name in kws: + values.append(kws.pop(name)) + elif name in defaults: + values.append(defaults[name]) + else: + raise TypeError("missing '%s' arg" % name) + if kws: + raise TypeError("function has no keyword arguments: %s" % + tuple(kws.keys())) + return values + + +def _make_docstring(name, decl): + ret, args = decl + doc = [] + + doc.append("Wrapper for '%s'" % name) + + doc.append('') + doc.append("Args") + doc.append("----") + for a, t in args: + doc.append("%s: %s" % (a, t)) + + return '\n'.join(doc) + + +def _dummy_preparer(val): + return val, None + + +class _api_function(object): + __slots__ = 'fn', 'argtypes', 'argnames', 'defaults' + + def __init__(self, fn, decl): + self.fn = fn + self.argnames, self.argtypes = zip(*decl[1]) + self.defaults = {} + self.set_defaults() + assert self.argnames[0] == 'handle' + preparers = [] + for k in self.argnames: + pname = 'prepare_%s' % k + if hasattr(self, pname): + meth = getattr(self, pname) + preparers.append(meth) + else: + preparers.append(_dummy_preparer) + + self.preparers = tuple(preparers) + + def __call__(self, *args, **kws): + args = _flatten_args(args, kws, self.argnames, self.defaults) + rargs = [pre(val) for pre, val in zip(self.preparers, args)] + actual, hold = zip(*rargs) + #for k, v in zip(self.argnames, actual): + #print(k, v) + self.fn(*actual) + return self.return_value(*hold) + + def set_defaults(self): + if ('idxBase' in self.argnames and + 'cusparseIndexBase_t' in self.argtypes): + self.defaults['idxBase'] = CUSPARSE_INDEX_BASE_ZERO + + def return_value(self, *args): + return + + +def _make_api_function(name, base): + return type(name, (base,), {}) + + +def _prepare_array(self, val): + return device_pointer(val), val + + +def _prepare_hybpartition(self, val): + if val == 'A': + return CUSPARSE_HYB_PARTITION_AUTO, None + elif val == 'U': + return CUSPARSE_HYB_PARTITION_USER, None 
+ elif val == 'M': + return CUSPARSE_HYB_PARTITION_MAX, None + else: + raise ValueError("Partition flag must be either 'A', 'U' or 'M'") + + +def _prepare_direction_flag(self, val): + if val == 'R': + return CUSPARSE_DIRECTION_ROW, None + elif val == 'C': + return CUSPARSE_DIRECTION_COLUMN, None + else: + raise ValueError("Direction flag must be either 'R' or 'C'") + + +def _prepare_operation_flag(self, val): + if val == 'N': + return CUSPARSE_OPERATION_NON_TRANSPOSE, None + elif val == 'T': + return CUSPARSE_OPERATION_TRANSPOSE, None + elif val == 'C': + return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE, None + else: + raise ValueError("Operation flag must be either 'N', 'T' or 'C'") + + +def _prepare_matdescr(self, val): + return val._handle, None + + +def _prepare_hybmat(self, val): + raise NotImplementedError + + +def _prepare_action(self, val): + if val == 'N': + return CUSPARSE_ACTION_NUMERIC, None + elif val == 'S': + return CUSPARSE_ACTION_SYMBOLIC, None + else: + raise ValueError("Action must be either 'N' or 'S'") + + +def _prepare_solveinfo(self, val): + return val._handle, None + + +class _array_double_complex(object): + def __init__(self, val=None): + cval, data = memalign(c_double_complex, align=16) + self._data = data + self._cval = cval + self._as_parameter_ = byref(self._cval) + + if val is not None: + val = c_double_complex(val) + self._cval.real = val.real + self._cval.imag = val.imag + + @property + def value(self): + return self._cval.value + + +def _prepare_scalar(self, val): + if self.T == c_double_complex: + data = _array_double_complex(val) + return data, data + else: + v = self.T(val) + return byref(v), v + + +def _prepare_scalar_out(self, val): + if self.T == c_double_complex: + data = _array_double_complex(val) + return data, data + else: + v = self.T() + return byref(v), v + + +class _axpyi_v2(_api_function): + __slots__ = () + + prepare_alpha = _prepare_scalar + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + +class Saxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_float + + +class Daxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_double + + +class Caxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_complex + + +class Zaxpyi_v2(_axpyi_v2): + __slots__ = () + T = c_double_complex + + +class _bsr2csr(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + + prepare_bsrValA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + prepare_descrA = _prepare_matdescr + prepare_descrC = _prepare_matdescr + + +Sbsr2csr = Dbsr2csr = Cbsr2csr = Zbsr2csr = _bsr2csr + + +class _bsrmv(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_bsrValA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + prepare_x = _prepare_array + prepare_y = _prepare_array + + prepare_descrA = _prepare_matdescr + + +class Sbsrmv(_bsrmv): + __slots__ = () + T = c_float + + +class Dbsrmv(_bsrmv): + __slots__ = () + T = c_double + + +class Cbsrmv(_bsrmv): + __slots__ = () + T = c_complex + + +class Zbsrmv(_bsrmv): + __slots__ = () + T = c_double_complex + + +class _bsrxmv(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + + prepare_alpha = 
_prepare_scalar + prepare_beta = _prepare_scalar + + prepare_bsrValA = _prepare_array + prepare_bsrMaskPtrA = _prepare_array + prepare_bsrRowPtrA = _prepare_array + prepare_bsrEndPtrA = _prepare_array + prepare_bsrColIndA = _prepare_array + + prepare_x = _prepare_array + prepare_y = _prepare_array + + prepare_descrA = _prepare_matdescr + + +class Sbsrxmv(_bsrxmv): + __slots__ = () + T = c_float + + +class Dbsrxmv(_bsrxmv): + __slots__ = () + T = c_double + + +class Cbsrxmv(_bsrxmv): + __slots__ = () + T = c_complex + + +class Zbsrxmv(_bsrxmv): + __slots__ = () + T = c_double_complex + + +class _csc2dense(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_transA = _prepare_operation_flag + + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_A = _prepare_array + + prepare_descrA = _prepare_matdescr + + +Scsc2dense = Dcsc2dense = Ccsc2dense = Zcsc2dense = _csc2dense + + +class _csc2hyb(_api_function): + __slots__ = () + prepare_descrA = _prepare_matdescr + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_hybA = _prepare_hybmat + prepare_partitionType = _prepare_hybpartition + + +Scsc2hyb = Dcsc2hyb = Ccsc2hyb = Zcsc2hyb = _csc2hyb + + +class _csr2bsr(_api_function): + __slots__ = () + + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_descrC = _prepare_matdescr + + prepare_bsrValC = _prepare_array + prepare_bsrRowPtrC = _prepare_array + prepare_bsrColIndC = _prepare_array + + +Scsr2bsr = Dcsr2bsr = Ccsr2bsr = Zcsr2bsr = _csr2bsr + + +class _csr2csc_v2(_api_function): + __slots__ = () + + prepare_csrVal = _prepare_array + prepare_csrRowPtr = _prepare_array + prepare_csrColInd = _prepare_array + prepare_cscVal = _prepare_array + prepare_cscRowInd = _prepare_array + prepare_cscColPtr = _prepare_array + + prepare_copyValues = _prepare_action + + +Scsr2csc_v2 = Dcsr2csc_v2 = Ccsr2csc_v2 = Zcsr2csc_v2 = _csr2csc_v2 + + +class _csr2dense(_api_function): + __slots__ = () + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_A = _prepare_array + + +Scsr2dense = Dcsr2dense = Ccsr2dense = Zcsr2dense = _csr2dense + + +class _csr2hyb(_api_function): + descrA = _prepare_matdescr + csrValA = _prepare_array + csrRowPtrA = _prepare_array + csrColIndA = _prepare_array + hybA = _prepare_hybmat + partitionType = _prepare_hybpartition + + +Scsr2hyb = Dcsr2hyb = Ccsr2hyb = Zcsr2hyb = _csr2hyb + + +class _csrgeam(_api_function): + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_descrB = _prepare_matdescr + prepare_csrValB = _prepare_array + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + + prepare_descrC = _prepare_matdescr + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + +class Scsrgeam(_csrgeam): + T = c_float + + +class Dcsrgeam(_csrgeam): + T = c_double + + +class Ccsrgeam(_csrgeam): + T = c_complex + + +class Zcsrgeam(_csrgeam): + T = c_double_complex + 
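+# Illustrative note (a sketch, not part of the generated API surface): the
+# classes in this module only declare `prepare_<arg>` hooks plus a scalar
+# ctype `T`; the call logic lives in `_api_function.__call__`, which routes
+# each argument through its preparer before invoking the ctypes function.
+# Minimal usage sketch, assuming hypothetical device arrays `d_xVal`,
+# `d_xInd` and `d_y` allocated by the caller, and using the `cuSparse`
+# wrapper built at the bottom of this module:
+#
+#     sp = cuSparse()
+#     sp.Saxpyi(nnz=d_xInd.size, alpha=2.0, xVal=d_xVal, xInd=d_xInd, y=d_y)
+#
+# Here `alpha` goes through `_prepare_scalar` (byref of a `c_float`), the
+# device arrays go through `_prepare_array` (`device_pointer(...)`), and
+# `idxBase` falls back to the CUSPARSE_INDEX_BASE_ZERO default set in
+# `_api_function.set_defaults`.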
+ +class _csrgemm(_api_function): + prepare_transA = _prepare_operation_flag + prepare_transB = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + + prepare_csrValB = _prepare_array + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrValC = _prepare_array + prepare_csrRowPtrC = _prepare_array + prepare_csrColIndC = _prepare_array + + +Scsrgemm = Dcsrgemm = Ccsrgemm = Zcsrgemm = _csrgemm + + +class _csric0(_api_function): + prepare_trans = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA_ValM = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsric0 = Dcsric0 = Ccsric0 = Zcsric0 = _csric0 + + +class _csrilu0(_api_function): + prepare_trans = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA_ValM = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrilu0 = Dcsrilu0 = Ccsrilu0 = Zcsrilu0 = _csrilu0 + + +class _csrmm2(_api_function): + prepare_transa = _prepare_operation_flag + prepare_transb = _prepare_operation_flag + + prepare_alpha = _prepare_scalar + prepare_beta = _prepare_scalar + + prepare_descrA = _prepare_matdescr + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + prepare_B = _prepare_array + prepare_C = _prepare_array + + +class Scsrmm2(_csrmm2): + T = c_float + + +class Dcsrmm2(_csrmm2): + T = c_double + + +class Ccsrmm2(_csrmm2): + T = c_complex + + +class Zcsrmm2(_csrmm2): + T = c_double_complex + + +class _csrmm_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_B = _prepare_array + prepare_beta = _prepare_scalar + prepare_C = _prepare_array + + +class Scsrmm_v2(_csrmm_v2): + T = c_float + + +class Dcsrmm_v2(_csrmm_v2): + T = c_double + + +class Ccsrmm_v2(_csrmm_v2): + T = c_complex + + +class Zcsrmm_v2(_csrmm_v2): + T = c_double_complex + + +class _csrmv_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_x = _prepare_array + prepare_beta = _prepare_scalar + prepare_y = _prepare_array + + +class Scsrmv_v2(_csrmv_v2): + T = c_float + + +class Dcsrmv_v2(_csrmv_v2): + T = c_double + + +class Ccsrmv_v2(_csrmv_v2): + T = c_complex + + +class Zcsrmv_v2(_csrmv_v2): + T = c_double_complex + + +class _csrsm_analysis(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrsm_analysis = Dcsrsm_analysis = _csrsm_analysis +Ccsrsm_analysis = Zcsrsm_analysis = _csrsm_analysis + + +class _csrsm_solve(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = 
_prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +class Scsrsm_solve(_csrsm_solve): + T = c_float + + +class Dcsrsm_solve(_csrsm_solve): + T = c_double + + +class Ccsrsm_solve(_csrsm_solve): + T = c_complex + + +class Zcsrsm_solve(_csrsm_solve): + T = c_double_complex + + +class _csrsv_analysis_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + + +Scsrsv_analysis_v2 = Dcsrsv_analysis_v2 = _csrsv_analysis_v2 +Ccsrsv_analysis_v2 = Zcsrsv_analysis_v2 = _csrsv_analysis_v2 + + +class _csrsv_solve_v2(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +class Scsrsv_solve_v2(_csrsv_solve_v2): + T = c_float + + +class Dcsrsv_solve_v2(_csrsv_solve_v2): + T = c_double + + +class Ccsrsv_solve_v2(_csrsv_solve_v2): + T = c_complex + + +class Zcsrsv_solve_v2(_csrsv_solve_v2): + T = c_double_complex + + +class _dense2csc(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_cscValA = _prepare_array + prepare_cscRowIndA = _prepare_array + prepare_cscColPtrA = _prepare_array + prepare_nnzPerCol = _prepare_array + + +Sdense2csc = Ddense2csc = Cdense2csc = Zdense2csc = _dense2csc + + +class _dense2csr(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_nnzPerRow = _prepare_array + +Sdense2csr = Ddense2csr = Cdense2csr = Zdense2csr = _dense2csr + + +class _dense2hyb(_api_function): + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_nnzPerRow = _prepare_array + prepare_hybA = _prepare_hybmat + prepare_partitionType = _prepare_hybpartition + + +Sdense2hyb = Ddense2hyb = Cdense2hyb = Zdense2hyb = _dense2hyb + + +class _dotci(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + prepare_resultDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('resultDevHostPtr')].value + + def set_defaults(self): + super(_dotci, self).set_defaults() + self.defaults['resultDevHostPtr'] = 0 + + +class Cdotci(_dotci): + T = c_complex + + +class Zdotci(_dotci): + T = c_double_complex + + +class _doti(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + prepare_resultDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('resultDevHostPtr')].value + + def set_defaults(self): + super(_doti, self).set_defaults() + self.defaults['resultDevHostPtr'] = 0 + + +class Sdoti(_doti): + T = c_float + + +class Ddoti(_doti): + T = c_double + + +class Cdoti(_doti): + T = c_complex + + +class Zdoti(_doti): + T = c_double_complex + + +class _gthr(_api_function): + prepare_y = _prepare_array + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + + +Sgthr = Dgthr = Cgthr = Zgthr = _gthr + + +class _gthrz(_api_function): + prepare_y = 
_prepare_array + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + + +Sgthrz = Dgthrz = Cgthrz = Zgthrz = _gthrz + + +class _gtsv(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_B = _prepare_array + + +Sgtsv = Dgtsv = Cgtsv = Zgtsv = _gtsv + + +class _gtsvStridedBatch(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_x = _prepare_array + + +SgtsvStridedBatch = DgtsvStridedBatch = _gtsvStridedBatch +CgtsvStridedBatch = ZgtsvStridedBatch = _gtsvStridedBatch + + +class _gtsv_nopivot(_api_function): + prepare_dl = _prepare_array + prepare_d = _prepare_array + prepare_du = _prepare_array + prepare_B = _prepare_array + + +Sgtsv_nopivot = Dgtsv_nopivot = Cgtsv_nopivot = Zgtsv_nopivot = _gtsv_nopivot + + +class _hyb2csc(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_cscVal = _prepare_array + prepare_cscRowInd = _prepare_array + prepare_cscColPtr = _prepare_array + + +Shyb2csc = Dhyb2csc = Chyb2csc = Zhyb2csc = _hyb2csc + + +class _hyb2csr(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_csrValA = _prepare_array + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + + +Shyb2csr = Dhyb2csr = Chyb2csr = Zhyb2csr = _hyb2csr + + +class _hyb2dense(_api_function): + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_A = _prepare_array + + +Shyb2dense = Dhyb2dense = Chyb2dense = Zhyb2dense = _hyb2dense + + +class _hybmv(_api_function): + prepare_transA = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_x = _prepare_array + prepare_beta = _prepare_scalar + + +Shybmv = Dhybmv = Chybmv = Zhybmv = _hybmv + + +class _hybsv_analysis(_api_function): + prepare_transA = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_info = _prepare_solveinfo + + +Shybsv_analysis = Dhybsv_analysis = _hybsv_analysis +Chybsv_analysis = Zhybsv_analysis = _hybsv_analysis + + +class _hybsv_solve(_api_function): + prepare_trans = _prepare_operation_flag + prepare_alpha = _prepare_scalar + prepare_descra = _prepare_matdescr + prepare_hybA = _prepare_hybmat + prepare_info = _prepare_solveinfo + prepare_x = _prepare_array + prepare_y = _prepare_array + + +Shybsv_solve = Dhybsv_solve = Chybsv_solve = Zhybsv_solve = _hybsv_solve + + +class _nnz(_api_function): + T = c_int + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + prepare_A = _prepare_array + prepare_nnzPerRowCol = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +Snnz = Dnnz = Cnnz = Znnz = _nnz + + +class _sctr(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + + +Ssctr = Dsctr = Csctr = Zsctr = _sctr + + +class _roti_v2(_api_function): + prepare_xVal = _prepare_array + prepare_xInd = _prepare_array + prepare_y = _prepare_array + prepare_c = _prepare_scalar + prepare_s = _prepare_scalar + + +class Sroti_v2(_roti_v2): + T = c_float + + +class Droti_v2(_roti_v2): + T = c_double + + +class Xcoo2csr(_api_function): + prepare_cooRowInd = _prepare_array + prepare_csrRowPtr = _prepare_array + + +class Xcsr2coo(_api_function): + prepare_csrRowPtr = 
_prepare_array + prepare_cooRowInd = _prepare_array + + +class Xcsr2bsrNnz(_api_function): + T = c_int + prepare_dirA = _prepare_direction_flag + prepare_descrA = _prepare_matdescr + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_bsrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +class XcsrgeamNnz(_api_function): + T = c_int + prepare_descrA = _prepare_matdescr + + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +class XcsrgemmNnz(_api_function): + T = c_int + prepare_transA = _prepare_operation_flag + prepare_transB = _prepare_operation_flag + prepare_descrA = _prepare_matdescr + prepare_csrRowPtrA = _prepare_array + prepare_csrColIndA = _prepare_array + prepare_descrB = _prepare_matdescr + prepare_csrRowPtrB = _prepare_array + prepare_csrColIndB = _prepare_array + prepare_descrC = _prepare_matdescr + prepare_csrRowPtrC = _prepare_array + prepare_nnzTotalDevHostPtr = _prepare_scalar_out + + def return_value(self, *args): + return args[self.argnames.index('nnzTotalDevHostPtr')].value + + +def _init_api_function(name, decl): + lib = libcusparse() + mangled = mangle(name) + for k in globals().keys(): + if mangled.endswith(k): + base = globals()[k] + break + else: + # print("missing", name) + raise NotImplementedError(name) + # return mangled, None + + docs = _make_docstring(name, decl) + cls = _make_api_function(name, base) + + fn = getattr(lib, name) + + obj = cls(fn, decl) + + def method(self, *args, **kws): + return obj(self._handle, *args, **kws) + + method.__doc__ = docs + + return mangled, method + + +_bypassed = frozenset(''' +cusparseCreate +cusparseDestroy +cusparseCreateHybMat +cusparseCreateMatDescr +cusparseCreateSolveAnalysisInfo +cusparseDestroyHybMat +cusparseDestroyMatDescr +cusparseDestroySolveAnalysisInfo +cusparseGetMatDiagType +cusparseGetLevelInfo +cusparseGetMatFillMode +cusparseGetMatIndexBase +cusparseGetMatType +cusparseSetMatDiagType +cusparseSetMatFillMode +cusparseSetMatIndexBase +cusparseSetMatType +cusparseGetPointerMode +cusparseSetPointerMode +cusparseSetStream +cusparseGetVersion +'''.split()) + + +def _init_cuSparse(): + gv = {} + for k, v in _declarations(): + if k not in _bypassed: + name, func = _init_api_function(k, v) + assert name not in gv + gv[name] = func + + # rewrite _v2 names + for k in list(gv.keys()): + if k.endswith('_v2'): + stripped = k[:-3] + assert stripped + '_v2' == k + gv[stripped] = gv[k] + + base = _cuSparse + return type('cuSparse', (base,), gv) + + +cuSparse = _init_cuSparse() diff --git a/pyculib/sparse/decls.py b/pyculib/sparse/decls.py new file mode 100644 index 0000000..2e6cfab --- /dev/null +++ b/pyculib/sparse/decls.py @@ -0,0 +1,547 @@ +# This file was auto-generated + +cusparseCreate = ('cusparseStatus_t', (('handle', 'cusparseHandle_t*'),)) + + +cusparseDestroy = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'),)) + + +cusparseGetVersion = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('version', 'int*'),)) + + +cusparseSetStream = 
('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('streamId', 'cudaStream_t'),)) + + +cusparseGetPointerMode = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('mode', 'cusparsePointerMode_t*'),)) + + +cusparseSetPointerMode = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('mode', 'cusparsePointerMode_t'),)) + + +cusparseCreateMatDescr = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t*'),)) + + +cusparseDestroyMatDescr = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatType = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('type', 'cusparseMatrixType_t'),)) + + +cusparseGetMatType = ('cusparseMatrixType_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatFillMode = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('fillMode', 'cusparseFillMode_t'),)) + + +cusparseGetMatFillMode = ('cusparseFillMode_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatDiagType = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('diagType', 'cusparseDiagType_t'),)) + + +cusparseGetMatDiagType = ('cusparseDiagType_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseSetMatIndexBase = ('cusparseStatus_t', (('descrA', 'cusparseMatDescr_t'), ('base', 'cusparseIndexBase_t'),)) + + +cusparseGetMatIndexBase = ('cusparseIndexBase_t', (('descrA', 'cusparseMatDescr_t'),)) + + +cusparseCreateSolveAnalysisInfo = ('cusparseStatus_t', (('info', 'cusparseSolveAnalysisInfo_t*'),)) + + +cusparseDestroySolveAnalysisInfo = ('cusparseStatus_t', (('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseGetLevelInfo = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('nlevels', 'int*'), ('levelPtr', 'int**'), ('levelInd', 'int**'),)) + + +cusparseCreateHybMat = ('cusparseStatus_t', (('hybA', 'cusparseHybMat_t*'),)) + + +cusparseDestroyHybMat = ('cusparseStatus_t', (('hybA', 'cusparseHybMat_t'),)) + + +cusparseSaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZaxpyi_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('resultDevHostPtr', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('resultDevHostPtr', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('resultDevHostPtr', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZdoti = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 
'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('resultDevHostPtr', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCdotci = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('resultDevHostPtr', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZdotci = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('resultDevHostPtr', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZgthr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'float*'), ('xVal', 'float*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'double*'), ('xVal', 'double*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuComplex*'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZgthrz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('y', 'cuDoubleComplex*'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuComplex*'), ('xInd', 'int*'), ('y', 'cuComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZsctr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'cuDoubleComplex*'), ('xInd', 'int*'), ('y', 'cuDoubleComplex*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSroti_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'float*'), ('xInd', 'int*'), ('y', 'float*'), ('c', 'float*'), ('s', 'float*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDroti_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('nnz', 'int'), ('xVal', 'double*'), ('xInd', 'int*'), ('y', 'double*'), ('c', 'double*'), ('s', 'double*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseScsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 
'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZcsrmv_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseShybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDhybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseChybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZhybmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseSbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + 
+cusparseZbsrmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseSbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'float*'), ('beta', 'float*'), ('y', 'float*'),)) + + +cusparseDbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'double*'), ('beta', 'double*'), ('y', 'double*'),)) + + +cusparseCbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuComplex*'), ('beta', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZbsrxmv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('transA', 'cusparseOperation_t'), ('sizeOfMask', 'int'), ('mb', 'int'), ('nb', 'int'), ('nnzb', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrMaskPtrA', 'int*'), ('bsrRowPtrA', 'int*'), ('bsrEndPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('x', 'cuDoubleComplex*'), ('beta', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseScsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrsv_analysis_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 
'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('y', 'float*'),)) + + +cusparseDcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('y', 'double*'),)) + + +cusparseCcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseZcsrsv_solve_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseShybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDhybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseChybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZhybsv_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseShybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'float*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('y', 'float*'),)) + + +cusparseChybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'cuComplex*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('y', 'cuComplex*'),)) + + +cusparseDhybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'double*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('y', 'double*'),)) + + +cusparseZhybsv_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('alpha', 'cuDoubleComplex*'), ('descra', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('y', 'cuDoubleComplex*'),)) + + +cusparseScsrmm_v2 = 
('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'float*'), ('ldb', 'int'), ('beta', 'float*'), ('C', 'float*'), ('ldc', 'int'),)) + + +cusparseDcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'double*'), ('ldb', 'int'), ('beta', 'double*'), ('C', 'double*'), ('ldc', 'int'),)) + + +cusparseCcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuComplex*'), ('ldb', 'int'), ('beta', 'cuComplex*'), ('C', 'cuComplex*'), ('ldc', 'int'),)) + + +cusparseZcsrmm_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'), ('beta', 'cuDoubleComplex*'), ('C', 'cuDoubleComplex*'), ('ldc', 'int'),)) + + +cusparseScsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'float*'), ('ldb', 'int'), ('beta', 'float*'), ('C', 'float*'), ('ldc', 'int'),)) + + +cusparseDcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'double*'), ('ldb', 'int'), ('beta', 'double*'), ('C', 'double*'), ('ldc', 'int'),)) + + +cusparseCcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuComplex*'), ('ldb', 'int'), ('beta', 'cuComplex*'), ('C', 'cuComplex*'), ('ldc', 'int'),)) + + +cusparseZcsrmm2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transa', 'cusparseOperation_t'), ('transb', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('nnz', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'), ('beta', 'cuDoubleComplex*'), ('C', 'cuDoubleComplex*'), ('ldc', 'int'),)) + + +cusparseScsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), 
('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrsm_analysis = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('nnz', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'float*'), ('ldx', 'int'), ('y', 'float*'), ('ldy', 'int'),)) + + +cusparseDcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'double*'), ('ldx', 'int'), ('y', 'double*'), ('ldy', 'int'),)) + + +cusparseCcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuComplex*'), ('ldx', 'int'), ('y', 'cuComplex*'), ('ldy', 'int'),)) + + +cusparseZcsrsm_solve = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'), ('x', 'cuDoubleComplex*'), ('ldx', 'int'), ('y', 'cuDoubleComplex*'), ('ldy', 'int'),)) + + +cusparseScsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsrilu0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), 
('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseScsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseDcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseCcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseZcsric0 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('trans', 'cusparseOperation_t'), ('m', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA_ValM', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('info', 'cusparseSolveAnalysisInfo_t'),)) + + +cusparseSgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('B', 'float*'), ('ldb', 'int'),)) + + +cusparseDgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('B', 'double*'), ('ldb', 'int'),)) + + +cusparseCgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuComplex*'), ('d', 'cuComplex*'), ('du', 'cuComplex*'), ('B', 'cuComplex*'), ('ldb', 'int'),)) + + +cusparseZgtsv = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'),)) + + +cusparseSgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('B', 'float*'), ('ldb', 'int'),)) + + +cusparseDgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('B', 'double*'), ('ldb', 'int'),)) + + +cusparseCgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuComplex*'), ('d', 'cuComplex*'), ('du', 'cuComplex*'), ('B', 'cuComplex*'), ('ldb', 'int'),)) + + +cusparseZgtsv_nopivot = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('B', 'cuDoubleComplex*'), ('ldb', 'int'),)) + + +cusparseSgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'float*'), ('d', 'float*'), ('du', 'float*'), ('x', 'float*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseDgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'double*'), ('d', 'double*'), ('du', 'double*'), ('x', 'double*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseCgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'cuComplex*'), ('d', 
'cuComplex*'), ('du', 'cuComplex*'), ('x', 'cuComplex*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseZgtsvStridedBatch = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('dl', 'cuDoubleComplex*'), ('d', 'cuDoubleComplex*'), ('du', 'cuDoubleComplex*'), ('x', 'cuDoubleComplex*'), ('batchCount', 'int'), ('batchStride', 'int'),)) + + +cusparseXcsrgemmNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'float*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'double*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZcsrgemm = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('transA', 'cusparseOperation_t'), ('transB', 'cusparseOperation_t'), ('m', 'int'), ('n', 'int'), ('k', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuDoubleComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseXcsrgeamNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), 
('n', 'int'), ('alpha', 'float*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'float*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'float*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'double*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'double*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'double*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuComplex*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'cuComplex*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZcsrgeam = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('alpha', 'cuDoubleComplex*'), ('descrA', 'cusparseMatDescr_t'), ('nnzA', 'int'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('beta', 'cuDoubleComplex*'), ('descrB', 'cusparseMatDescr_t'), ('nnzB', 'int'), ('csrValB', 'cuDoubleComplex*'), ('csrRowPtrB', 'int*'), ('csrColIndB', 'int*'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseSnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseDnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseCnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseZnnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRowCol', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseSdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseDdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'double*'), ('csrRowPtrA', 
'int*'), ('csrColIndA', 'int*'),)) + + +cusparseCdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseZdense2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseScsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'double*'), ('lda', 'int'),)) + + +cusparseCcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZcsr2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseSdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseDdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseCdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseZdense2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerCol', 'int*'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'),)) + + +cusparseScsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDcsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'double*'), ('lda', 'int'),)) + + +cusparseCcsc2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZcsc2dense = ('cusparseStatus_t', (('handle', 
'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseXcoo2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('cooRowInd', 'int*'), ('nnz', 'int'), ('m', 'int'), ('csrRowPtr', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseXcsr2coo = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('csrRowPtr', 'int*'), ('nnz', 'int'), ('m', 'int'), ('cooRowInd', 'int*'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseScsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'float*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'float*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseDcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'double*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'double*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseCcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'cuComplex*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'cuComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseZcsr2csc_v2 = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('nnz', 'int'), ('csrVal', 'cuDoubleComplex*'), ('csrRowPtr', 'int*'), ('csrColInd', 'int*'), ('cscVal', 'cuDoubleComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'), ('copyValues', 'cusparseAction_t'), ('idxBase', 'cusparseIndexBase_t'),)) + + +cusparseSdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'float*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'double*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZdense2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'), ('nnzPerRow', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'float*'), ('lda', 'int'),)) + + +cusparseDhyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'double*'), ('lda', 'int'),)) + + 
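Each entry in this module is a prototype tuple of the form `(return_type, ((arg_name, arg_type), ...))`, one per cuSPARSE entry point. The sketch below shows how such a tuple can be rendered into a C declaration string; the `render_prototype` helper is hypothetical and only illustrates the tuple layout — it is not part of pyculib's binding layer.

```python
def render_prototype(name, proto):
    """Render a (return_type, args) prototype tuple into a C declaration."""
    restype, args = proto
    arglist = ', '.join('{} {}'.format(typ, argname) for argname, typ in args)
    return '{} {}({});'.format(restype, name, arglist)

# Using one of the entries defined above:
cusparseXcoo2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'),
                                         ('cooRowInd', 'int*'), ('nnz', 'int'),
                                         ('m', 'int'), ('csrRowPtr', 'int*'),
                                         ('idxBase', 'cusparseIndexBase_t'),))
print(render_prototype('cusparseXcoo2csr', cusparseXcoo2csr))
# -> cusparseStatus_t cusparseXcoo2csr(cusparseHandle_t handle, int* cooRowInd,
#    int nnz, int m, int* csrRowPtr, cusparseIndexBase_t idxBase);
```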
+cusparseChyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'cuComplex*'), ('lda', 'int'),)) + + +cusparseZhyb2dense = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('A', 'cuDoubleComplex*'), ('lda', 'int'),)) + + +cusparseScsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZcsr2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseDhyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseChyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseZhyb2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'),)) + + +cusparseScsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'float*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseDcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'double*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseCcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseZcsc2hyb = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('m', 'int'), ('n', 
'int'), ('descrA', 'cusparseMatDescr_t'), ('cscValA', 'cuDoubleComplex*'), ('cscRowIndA', 'int*'), ('cscColPtrA', 'int*'), ('hybA', 'cusparseHybMat_t'), ('userEllWidth', 'int'), ('partitionType', 'cusparseHybPartition_t'),)) + + +cusparseShyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'float*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseDhyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'double*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseChyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'cuComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseZhyb2csc = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('descrA', 'cusparseMatDescr_t'), ('hybA', 'cusparseHybMat_t'), ('cscVal', 'cuDoubleComplex*'), ('cscRowInd', 'int*'), ('cscColPtr', 'int*'),)) + + +cusparseXcsr2bsrNnz = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrRowPtrC', 'int*'), ('nnzTotalDevHostPtr', 'int*'),)) + + +cusparseScsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'float*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'float*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseDcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'double*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'double*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseCcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'cuComplex*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseZcsr2bsr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('m', 'int'), ('n', 'int'), ('descrA', 'cusparseMatDescr_t'), ('csrValA', 'cuDoubleComplex*'), ('csrRowPtrA', 'int*'), ('csrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('bsrValC', 'cuDoubleComplex*'), ('bsrRowPtrC', 'int*'), ('bsrColIndC', 'int*'),)) + + +cusparseSbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'float*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'float*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseDbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'double*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 
'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'double*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseCbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + + +cusparseZbsr2csr = ('cusparseStatus_t', (('handle', 'cusparseHandle_t'), ('dirA', 'cusparseDirection_t'), ('mb', 'int'), ('nb', 'int'), ('descrA', 'cusparseMatDescr_t'), ('bsrValA', 'cuDoubleComplex*'), ('bsrRowPtrA', 'int*'), ('bsrColIndA', 'int*'), ('blockDim', 'int'), ('descrC', 'cusparseMatDescr_t'), ('csrValC', 'cuDoubleComplex*'), ('csrRowPtrC', 'int*'), ('csrColIndC', 'int*'),)) + diff --git a/pyculib/tests/__init__.py b/pyculib/tests/__init__.py new file mode 100644 index 0000000..c9d3670 --- /dev/null +++ b/pyculib/tests/__init__.py @@ -0,0 +1,12 @@ +from os.path import dirname, join +from . import (test_blas_low_level, test_blas, test_fft, test_rand, + test_sorting, test_sparse) + +test_cases = ( + test_blas_low_level.test_cases + + test_blas.test_cases + + test_fft.test_cases + + test_rand.test_cases + + test_sorting.test_cases + + test_sparse.test_cases +) diff --git a/pyculib/tests/base.py b/pyculib/tests/base.py new file mode 100644 index 0000000..73e0d52 --- /dev/null +++ b/pyculib/tests/base.py @@ -0,0 +1,22 @@ +from unittest import TestCase, skipIf +from numba import cuda +from numba.cuda.cudadrv.error import CudaSupportError + + +def skip_cuda_tests(): + + try: + if cuda.is_available(): + gpus = cuda.list_devices() + if gpus and gpus[0].compute_capability >= (2, 0): + return False + else: + return True + return True + except CudaSupportError: + return True + + +@skipIf(skip_cuda_tests(), "CUDA not supported on this platform.") +class CUDATestCase(TestCase): + pass diff --git a/pyculib/tests/blas.py b/pyculib/tests/blas.py new file mode 100644 index 0000000..5face7a --- /dev/null +++ b/pyculib/tests/blas.py @@ -0,0 +1,395 @@ +from __future__ import print_function, absolute_import, division + +import unittest +import numpy as np +import scipy.linalg +from pyculib import warnings, config +from numba.testing.ddt import ddt, unpack, data +import time + +def create_array(dtype, shape, slices=None, empty=False): + """Create a test array of the given dtype and shape. + if slices is given, the returned array aliases a bigger parent array + using the specified start and step values. (The stop member is expected to + be appropriate to yield the given length.)""" + + from numpy.random import normal, seed + seed(1234) + + def total_size(s): + # this function doesn't support slices whose members are 'None' + return s.start + (s.stop - s.start)*np.abs(s.step) + + if not slices: + a = np.empty(dtype=dtype, shape=shape) + else: + if type(shape) is not tuple: # 1D + pshape = total_size(slices) + else: + pshape = tuple([total_size(s) for s in slices]) + parent = np.empty(dtype=dtype, shape=pshape) + a = parent[slices] + + if not empty: + mult = np.array(1, dtype=dtype) + a[:] = normal(0.,1.,shape).astype(dtype) * mult + return a + + +class TestBLAS: + """Create test cases by deriving from this (as well as unittest.TestCase. 
+ In the derived class, set the 'blas' attribute to the BLAS module that is + to be tested.""" + + blas = None + + def _test_dot(self, dtype, shape, slice, rtol=1e-07): + + x = create_array(dtype, shape, slice) + y = create_array(dtype, shape, slice) + res = self.blas.dot(x, y) + ref = np.dot(x, y) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_gemv(self, dtype, op, shape, slices, rtol=1e-07): + + sp_gemv = scipy.linalg.get_blas_funcs('gemv', dtype=dtype) + # f2py convention... + sp_trans = {'N':0, 'T':1, 'C':2} + + alpha = 2. + A = create_array(dtype, shape, slices, empty=True) + A[:] = np.arange(np.prod(shape), dtype=dtype).reshape(shape) + x = np.arange(op == 'N' and shape[1] or shape[0], dtype=dtype) + beta = 2. + y = np.arange(op == 'N' and shape[0] or shape[1], dtype=dtype) + res = self.blas.gemv(op, alpha, A, x, beta, y) + y = np.arange(op == 'N' and shape[0] or shape[1], dtype=dtype) + ref = sp_gemv(alpha, A, x, beta, y, trans=sp_trans[op]) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_axpy(self, dtype, size, slice, rtol=1e-07): + + sp_axpy = scipy.linalg.get_blas_funcs('axpy', dtype=dtype) + alpha = 2. + x = create_array(dtype, size, slice) + y = create_array(dtype, size, slice) + yr = np.copy(y) + res = self.blas.axpy(alpha, x, y) + ref = sp_axpy(x, yr, size, alpha) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def _test_gemm(self, dtype, opa, opb, ashape, aslices, bshape, bslices, rtol=1e-07): + + M = opa == 'N' and ashape[0] or ashape[1] + N = opb == 'N' and bshape[1] or bshape[0] + cshape = (M, N) + sp_gemm = scipy.linalg.get_blas_funcs('gemm', dtype=dtype) + # f2py convention... + sp_trans = {'N':0, 'T':1, 'C':2} + + alpha = 2. + A = create_array(dtype, ashape, aslices, empty=True) + A[:] = np.arange(np.prod(ashape), dtype=dtype).reshape(ashape) + B = create_array(dtype, bshape, bslices, empty=True) + B[:] = np.arange(np.prod(bshape), dtype=dtype).reshape(bshape) + beta = 5. + C = create_array(dtype, cshape, empty=True) + C[:] = np.arange(np.prod(cshape), dtype=dtype).reshape(cshape) + res = self.blas.gemm(opa, opb, alpha, A, B, beta, C) + # C may have been overwritten in the previous operation. + C = np.arange(np.prod(cshape), dtype=dtype).reshape(cshape) + ref = sp_gemm(alpha, A, B, beta, C, trans_a=sp_trans[opa], trans_b=sp_trans[opb]) + np.testing.assert_allclose(res, ref, rtol=rtol) + + def test_dot_invalid(self): + + x = np.arange(1024, dtype=np.float32) + y = np.arange(1024, dtype=np.float32) + # First make sure the original works... + res = self.blas.dot(x, y) + ref = np.dot(x, y) + np.testing.assert_allclose(res, ref, rtol=1e6) + # then check for various types of invalid input + with self.assertRaises(TypeError): # invalid type + self.blas.dot(np.arange(10), np.arange(10)) + with self.assertRaises(ValueError): + self.blas.dot(x, y.reshape(64, 16)) # invalid dim + with self.assertRaises(ValueError): + self.blas.dot(x, y[4:]) # invalid size + + def test_axpy_invalid(self): + + # First make sure the original works... + sp_axpy = scipy.linalg.get_blas_funcs('axpy', dtype=np.float32) + alpha = np.float32(2.) 
+ x = np.arange(64, dtype=np.float32) + y = np.arange(64, dtype=np.float32) + res = self.blas.axpy(alpha, x, y) + y = np.arange(64, dtype=np.float32) + ref = sp_axpy(x, y, 64, alpha) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + with self.assertRaises(TypeError): # invalid type + self.blas.axpy(7, np.arange(64), y) + with self.assertRaises(TypeError): # invalid type + self.blas.axpy(7, x, np.arange(64)) + with self.assertRaises(ValueError): + self.blas.axpy([1], x, y) # invalid scalar + with self.assertRaises(ValueError): + self.blas.axpy(alpha, x, y.reshape(8, 8)) # invalid dim + with self.assertRaises(ValueError): + self.blas.axpy(alpha, x, y[4:]) # invalid size + + def test_gemv_invalid(self): + + # First make sure the original works... + sp_gemv = scipy.linalg.get_blas_funcs('gemv', dtype=np.float32) + alpha = 2. + A = np.arange(64, dtype=np.float32).reshape(8,8) + x = np.arange(8, dtype=np.float32) + beta = 2. + y = np.arange(8, dtype=np.float32) + res = self.blas.gemv('N', alpha, A, x, beta, y) + y = np.arange(8, dtype=np.float32) + ref = sp_gemv(alpha, A, x, beta, y) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + i8x8 = np.arange(64).reshape(8,8) + i8 = np.arange(8) + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, i8x8, x, beta, y) # invalid type + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, A, i8, beta, y) # invalid type + with self.assertRaises(TypeError): + self.blas.gemv('N', alpha, A, x, beta, i8) # invalid type + with self.assertRaises(ValueError): + self.blas.gemv('X', alpha, A, x, beta, y) # invalid op + with self.assertRaises(ValueError): + self.blas.gemv( 'N', [1], A, x, beta, y) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A[0], x) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x.reshape(2, 4), beta, y) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x, beta, y.reshape(2, 4)) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A.reshape(64), x, beta, y) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A[1:,:], x, beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x[1:], beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('N', alpha, A, x, beta, y[1:]) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('T', alpha, A, x[1:], beta, y) # invalid size + with self.assertRaises(ValueError): + self.blas.gemv('T', alpha, A, x, beta, y[1:]) # invalid size + + + def test_gemm_invalid(self): + + # First make sure the original works... + sp_gemm = scipy.linalg.get_blas_funcs('gemm', dtype=np.float32) + + alpha = 2. + A = np.arange(64, dtype=np.float32).reshape(8, 8) + B = np.arange(64, dtype=np.float32).reshape(8, 8) + beta = 5. + C = np.arange(64, dtype=np.float32).reshape(8, 8) + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + # C may have been overwritten in the previous operation. 
+ C = np.arange(64, dtype=np.float32).reshape(8, 8) + ref = sp_gemm(alpha, A, B, beta, C) + np.testing.assert_allclose(res, ref) + # then check for various types of invalid input + i8x8 = np.arange(64).reshape(8,8) + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, i8x8, B, beta, C) # invalid type + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, A, i8x8, beta, C) # invalid type + with self.assertRaises(TypeError): + self.blas.gemm('N', 'N', alpha, A, B, beta, i8x8) # invalid type + with self.assertRaises(ValueError): + self.blas.gemm('X', 'N', alpha, A, B, beta, C) # invalid op + with self.assertRaises(ValueError): + self.blas.gemm('N', 'X', alpha, A, B, beta, C) # invalid op + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', [1], A, B, beta, C) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, [1], C) # invalid scalar + with self.assertRaises(ValueError): + self.blas.gemm('C', 'N', alpha, A[0], B) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B[0]) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A.reshape(64), B, beta, C) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B.reshape(64), beta, C) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C.reshape(64)) # invalid dim + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A[1:,:], B, beta, C) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B[1:,:], beta, C) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C[1:,:]) # invalid size + with self.assertRaises(ValueError): + self.blas.gemm('N', 'N', alpha, A, B, beta, C[:,1:]) # invalid size + + def test_gemv_default(self): + + # Check that default argument and aliasing rules work as expected + alpha = 2. + A = np.arange(64, dtype=np.float64).reshape(8,8) + x = np.arange(8, dtype=np.float64) + beta = 0. + yres = self.blas.gemv('N', alpha, A, x) + y = np.asfortranarray(np.arange(8, dtype=np.float64)) + res = self.blas.gemv('N', alpha, A, x, beta, y) + # Make sure the result is the same even with no default y... + np.testing.assert_allclose(yres, res) + # ...and res indeed aliases y + self.assertIs(res, y) + # Make sure this also works for non-contiguous y + p = np.arange(16, dtype=np.float64) / 2 + y = p[::2] + res = self.blas.gemv('N', alpha, A, x, beta, y) + np.testing.assert_allclose(yres, y) + np.testing.assert_allclose(res, y) + + def test_gemm_default(self): + + # Check that default argument and aliasing rules work as expected + alpha = 2. + A = np.arange(64, dtype=np.float64).reshape(8,8) + B = np.arange(64, dtype=np.float64).reshape(8,8) + beta = 0. + Cres = self.blas.gemm('N', 'N', alpha, A, B) + C = np.arange(64, dtype=np.float64).reshape(8, 8, order='F') + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + # Make sure the result is the same even with no default C... 
+ np.testing.assert_allclose(Cres, res) + # ...and res indeed aliases C + self.assertIs(res, C) + # Make sure this also works for non-contiguous C + p = np.arange(256, dtype=np.float64).reshape(16, 16) / 2 + C = p[::2,::2] + res = self.blas.gemm('N', 'N', alpha, A, B, beta, C) + np.testing.assert_allclose(Cres, C) + np.testing.assert_allclose(res, C) + + def test_dot_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(type(self.blas.dot(x, y)), np.float64) + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.complex64) + self.assertIs(type(self.blas.dot(x, y)), np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_dot_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.dot(x, y) + + def test_axpy_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.float64) + x = x.astype(np.complex64) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.complex128) + alpha = 2.+1j + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + self.assertIs(self.blas.axpy(alpha, x, y).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_axpy_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + x = np.arange(4, dtype=np.float64) + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.axpy(alpha, x, y) # type promotion + y = y.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.axpy(alpha, x[::2], y[::2]) # non-unit-stride + + def test_gemv_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. + A = np.arange(16, dtype=np.float64).reshape(4,4) + x = np.arange(4, dtype=np.float64) + beta = 0. + self.assertIs(self.blas.gemv('N', alpha, A, x).dtype.type, np.float64) + x = x.astype(np.complex64) + self.assertIs(self.blas.gemv('N', alpha, A, x).dtype.type, np.complex128) + y = np.asfortranarray(np.arange(4, dtype=np.float64)) + self.assertIs(self.blas.gemv('N', alpha, A, x, beta, y).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_gemv_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + A = np.arange(16, dtype=np.float64).reshape(4,4) + x = np.arange(4, dtype=np.float32) + beta = 0. + y = np.arange(4, dtype=np.float32) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A, x) # type promotion + x = x.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A, x, beta, y) # type promotion + y = y.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemv('N', alpha, A[::2,::2], x[::2], beta, y[::2]) # non-unit-stride + + def test_gemm_type_promotion(self): + + #Make sure the result has the appropriate type for mixed input types. + alpha = 2. 
+ A = np.arange(16, dtype=np.float64).reshape(4,4) + B = np.arange(16, dtype=np.float64).reshape(4,4) + beta = 0. + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B).dtype.type, np.float64) + A = A.astype(np.complex64) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B).dtype.type, np.complex128) + C = np.asfortranarray(np.arange(16, dtype=np.float64).reshape(4,4)) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B, beta, C).dtype.type, np.complex128) + C = C.astype(np.complex128) + self.assertIs(self.blas.gemm('N', 'N', alpha, A, B, beta, C).dtype.type, np.complex128) + + @unittest.skipIf(not config.WARNINGS, "warnings are disabled") + def test_gemm_warnings(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error") + alpha = 1. + A = np.arange(16, dtype=np.float32).reshape(4,4) + B = np.arange(16, dtype=np.float64).reshape(4,4) + beta = 0. + C = np.arange(16, dtype=np.complex64).reshape(4,4) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A, B) # type promotion + A = A.astype(np.float64) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A, B, beta, C) # type promotion + C = np.arange(16, dtype=np.float64).reshape(4,4) + with self.assertRaises(warnings.PerformanceWarning): + self.blas.gemm('N', 'N', alpha, A[::2,::2], B[::2,::2], beta, C[::2,::2]) # non-unit-stride + + + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_blas.py b/pyculib/tests/test_blas.py new file mode 100644 index 0000000..b54c67b --- /dev/null +++ b/pyculib/tests/test_blas.py @@ -0,0 +1,126 @@ +from __future__ import print_function, absolute_import, division + +import unittest +import numpy as np +from numba.testing.ddt import ddt, unpack, data +from pyculib.tests.base import CUDATestCase +from pyculib import blas as cublas +from pyculib.tests import blas + +@ddt +class TestCUDABLAS(blas.TestBLAS, CUDATestCase): + + blas = cublas + + @data((np.float32, 1024, slice(0, 1024, 1)), + (np.float64, 1024, slice(0, 1024, 1)), + (np.complex64, 1024, slice(0, 1024, 1)), + (np.complex128, 1024, slice(0, 1024, 1)), + (np.complex128, 1024, slice(1, 1025, 1)), + (np.float32, 1024, slice(0, 2048, 2)), + (np.float64, 1024, slice(0, 2048, 2)), + (np.complex64, 1024, slice(0, 2048, 2)), + (np.complex128, 1024, slice(0, 2048, 2))) + @unpack + def test_dot(self, dtype, shape, slice): + + eps = np.finfo(dtype).eps + self._test_dot(dtype, shape, slice, rtol=eps*10) + + @data((np.float32, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'T', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', (64, 67), (slice(1, 65, 1), slice(2, 69, 1))), + (np.complex128, 'T', (67, 64), (slice(2, 69, 1), slice(1, 65, 1))), + (np.complex128, 'C', (67, 64), (slice(2, 69, 1), slice(1, 65, 1))), + (np.float32, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.float64, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.complex64, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))), + (np.complex128, 'N', (64, 67), (slice(0, 128, 2), slice(0, 134, 2))) + ) + @unpack + def test_gemv(self, dtype, op, shape, slices): + + self._test_gemv(dtype, op, shape, slices) + + @data((np.float32, 64, slice(0, 64, 1)), + (np.float64, 64, slice(0, 64, 1)), + (np.complex64, 64, 
slice(0, 64, 1)), + (np.complex128, 64, slice(0, 64, 1)), + (np.complex128, 64, slice(1, 65, 1)), + (np.float32, 64, slice(0, 128, 2)), + (np.float64, 64, slice(0, 128, 2)), + (np.complex64, 64, slice(0, 128, 2)), + (np.complex128, 64, slice(0, 128, 2))) + @unpack + def test_axpy(self, dtype, size, slice): + + self._test_axpy(dtype, size, slice) + + @data((np.float32, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 63), (slice(0, 67, 1), slice(0, 63, 1))), + (np.float64, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.float64, 'T', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.float64, 'N', 'T', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.float64, 'T', 'T', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'T', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'N', 'T', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'T', 'T', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'C', 'N', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex64, 'N', 'C', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex64, 'C', 'C', + (67, 64), (slice(0, 67, 1), slice(0, 64, 1)), + (64, 67), (slice(0, 64, 1), slice(0, 67, 1))), + (np.complex128, 'N', 'N', + (64, 67), (slice(0, 64, 1), slice(0, 67, 1)), + (67, 64), (slice(0, 67, 1), slice(0, 64, 1))), + (np.complex128, 'N', 'N', + (64, 67), (slice(1, 65, 1), slice(2, 69, 1)), + (67, 64), (slice(1, 68, 1), slice(2, 66, 1))), + (np.float32, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.float64, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.complex64, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2))), + (np.complex128, 'N', 'N', + (64, 65), (slice(0, 128, 2), slice(0, 130, 2)), + (65, 63), (slice(0, 130, 2), slice(0, 126, 2)))) + @unpack + def test_gemm(self, dtype, opa, opb, ashape, aslices, bshape, bslices): + + eps = np.finfo(dtype).eps + self._test_gemm(dtype, opa, opb, ashape, aslices, bshape, bslices, rtol=eps*10) + + +test_cases = (TestCUDABLAS,) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_blas_low_level.py b/pyculib/tests/test_blas_low_level.py new file mode 100644 index 0000000..fb07ac2 --- /dev/null +++ b/pyculib/tests/test_blas_low_level.py @@ -0,0 +1,2003 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCuBlasBinding(CUDATestCase): + def test_lib(self): + from pyculib.blas.binding import (cuBlas, + CUBLAS_POINTER_MODE_HOST, + CUBLAS_ATOMICS_NOT_ALLOWED) + + stream = cuda.stream() + blas = cuBlas() + blas.stream = stream + self.assertTrue(blas.stream is stream) + blas.pointer_mode = 
CUBLAS_POINTER_MODE_HOST + self.assertTrue(blas.pointer_mode == CUBLAS_POINTER_MODE_HOST) + blas.atomics_mode = CUBLAS_ATOMICS_NOT_ALLOWED + self.assertTrue(blas.atomics_mode == CUBLAS_ATOMICS_NOT_ALLOWED) + + def Tnrm2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) + exp = np.linalg.norm(x) + self.assertTrue(np.allclose(got, exp)) + + def test_Snrm2(self): + self.Tnrm2('Snrm2', np.float32) + + def test_Dnrm2(self): + self.Tnrm2('Dnrm2', np.float64) + + def test_Scnrm2(self): + self.Tnrm2('Scnrm2', np.complex64) + + def test_Dznrm2(self): + self.Tnrm2('Dznrm2', np.complex128) + + def Tdot(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1, d_y, 1) + if fn.endswith('c'): + exp = np.vdot(x, y) + else: + exp = np.dot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_Sdot(self): + self.Tdot('Sdot', np.float32) + + def test_Ddot(self): + self.Tdot('Ddot', np.float64) + + def test_Cdotu(self): + self.Tdot('Cdotu', np.complex64) + + def test_Zdotu(self): + self.Tdot('Zdotu', np.complex128) + + def test_Cdotc(self): + self.Tdot('Cdotc', np.complex64) + + def test_Zdotc(self): + self.Tdot('Zdotc', np.complex128) + + def Tscal(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + alpha = 1.234 + x = np.random.random(10).astype(dtype) + x0 = x.copy() + d_x = cuda.to_device(x) + + blas = cuBlas() + getattr(blas, fn)(x.size, alpha, d_x, 1) + + d_x.copy_to_host(x) + + self.assertTrue(np.allclose(x0 * alpha, x)) + + def test_Sscal(self): + self.Tscal('Sscal', np.float32) + + def test_Dscal(self): + self.Tscal('Dscal', np.float64) + + def test_Cscal(self): + self.Tscal('Cscal', np.complex64) + + def test_Zscal(self): + self.Tscal('Zscal', np.complex128) + + def test_Csscal(self): + self.Tscal('Csscal', np.complex64) + + def test_Zdscal(self): + self.Tscal('Zdscal', np.complex128) + + def Taxpy(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + alpha = 1.234 + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + y0 = y.copy() + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, alpha, d_x, 1, d_y, 1) + + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(alpha * x + y0, y)) + + def test_Saxpy(self): + self.Taxpy('Saxpy', np.float32) + + def test_Daxpy(self): + self.Taxpy('Daxpy', np.float64) + + def test_Caxpy(self): + self.Taxpy('Caxpy', np.complex64) + + def test_Zaxpy(self): + self.Taxpy('Zaxpy', np.complex128) + + def Itamax(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) - 1 + self.assertTrue(np.allclose(np.argmax(x), got)) + + def test_Isamax(self): + self.Itamax('Isamax', np.float32) + + def test_Idamax(self): + self.Itamax('Idamax', np.float64) + + def test_Icamax(self): + self.Itamax('Icamax', np.complex64) + + def test_Izamax(self): + self.Itamax('Izamax', np.complex128) + + def Itamin(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) - 1 + 
self.assertTrue(np.allclose(np.argmin(x), got)) + + def test_Isamin(self): + self.Itamin('Isamin', np.float32) + + def test_Idamin(self): + self.Itamin('Idamin', np.float64) + + def test_Icamin(self): + self.Itamin('Icamin', np.complex64) + + def test_Izamin(self): + self.Itamin('Izamin', np.complex128) + + def Tasum(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + + d_x = cuda.to_device(x) + + blas = cuBlas() + got = getattr(blas, fn)(x.size, d_x, 1) + self.assertTrue(np.allclose(np.sum(x), got)) + + def test_Sasum(self): + self.Tasum('Sasum', np.float32) + + def test_Dasum(self): + self.Tasum('Dasum', np.float64) + + def test_Scasum(self): + self.Tasum('Scasum', np.complex64) + + def test_Dzasum(self): + self.Tasum('Dzasum', np.complex128) + + def Trot(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + angle = 1.342 + c = np.cos(angle) + s = np.sin(angle) + + x0, y0 = c * x + s * y, -s * x + c * y + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, d_x, 1, d_y, 1, c, s) + + d_x.copy_to_host(x) + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_Srot(self): + self.Trot('Srot', np.float32) + + def test_Drot(self): + self.Trot('Drot', np.float64) + + def test_Crot(self): + self.Trot('Crot', np.complex64) + + def test_Zrot(self): + self.Trot('Zrot', np.complex128) + + def test_Csrot(self): + self.Trot('Csrot', np.complex64) + + def test_Zdrot(self): + self.Trot('Zdrot', np.complex128) + + def Trotg(self, fn): + from pyculib.blas.binding import cuBlas + + a, b = np.random.random(), np.random.random() + blas = cuBlas() + r, z, c, s = getattr(blas, fn)(a, b) + + rot = np.array([[c, s], + [-np.conj(s), c]]) + vec = np.array([[a], + [b]]) + exp = np.dot(rot, vec) + got = np.array([[r], + [0.0]]) + self.assertTrue(np.allclose(exp, got, atol=1e-6)) + + def test_Srotg(self): + self.Trotg('Srotg') + + def test_Drotg(self): + self.Trotg('Drotg') + + def test_Crotg(self): + self.Trotg('Crotg') + + def test_Zrotg(self): + self.Trotg('Zrotg') + + def Trotm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + + param = np.random.random(5).astype(dtype) + param[0] = -1.0 + h11, h21, h12, h22 = param[1:].tolist() + + x0, y0 = h11 * x + h12 * y, h21 * x + h22 * y + + d_x = cuda.to_device(x) + d_y = cuda.to_device(y) + + blas = cuBlas() + getattr(blas, fn)(x.size, d_x, 1, d_y, 1, param) + + d_x.copy_to_host(x) + d_y.copy_to_host(y) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_Srotm(self): + self.Trotm('Srotm', np.float32) + + def test_Drotm(self): + self.Trotm('Drotm', np.float64) + + def Trotmg(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + d1, d2, x1, y1 = np.random.random(4).tolist() + + blas = cuBlas() + param = getattr(blas, fn)(d1, d2, x1, y1) + + flag, h11, h21, h12, h22 = param.tolist() + + if flag == -1.0: + pass # don't know how to check + elif flag == 0.0: + self.assertEqual(h11, 0) + self.assertEqual(h22, 0) + elif flag == 1.0: + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + else: + self.assertEqual(flag, -2.0) + self.assertEqual(h11, 0) + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + self.assertEqual(h22, 0) + + def test_Srotmg(self): + self.Trotmg('Srotmg', np.float32) 
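The low-level binding tests above all follow the same round-trip pattern: stage NumPy operands on the device with `numba.cuda.to_device`, call the corresponding `cuBlas` method, copy the result back, and compare against a NumPy reference. A minimal standalone sketch of that pattern, reusing the `Dnrm2` binding exercised above (it assumes a CUDA-capable device and a working pyculib installation):

```python
import numpy as np
from numba import cuda
from pyculib.blas.binding import cuBlas

x = np.random.random(10)               # float64 host data
d_x = cuda.to_device(x)                 # stage the operand on the GPU
got = cuBlas().Dnrm2(x.size, d_x, 1)    # low-level 2-norm binding
np.testing.assert_allclose(got, np.linalg.norm(x))
```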
+ + def test_Drotmg(self): + self.Trotmg('Drotmg', np.float64) + + # + # Level 2 tests + # They just simply test to see if the binding works; doesn't check for + # correct result. + # + + def Tgbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + kl = 0 + ku = 0 + alpha = 1. + beta = 0. + A = np.array([[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + lda, n = A.shape + m = lda + y0 = y.copy() + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + getattr(blas, fn)('N', m, n, kl, ku, alpha, dA, lda, dx, 1, beta, dy, 1) + dy.copy_to_host(y) + self.assertFalse(all(y0 == y)) + + def test_Sgbmv(self): + self.Tgbmv('Sgbmv', np.float32) + + def test_Dgbmv(self): + self.Tgbmv('Dgbmv', np.float64) + + def test_Cgbmv(self): + self.Tgbmv('Cgbmv', np.complex64) + + def test_Zgbmv(self): + self.Tgbmv('Zgbmv', np.complex128) + + def Tgemv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + alpha = 1. + beta = 0. + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + m, n = A.shape + lda = m + y0 = y.copy() + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + getattr(blas, fn)('N', m, n, alpha, dA, lda, dx, 1, beta, dy, 1) + dy.copy_to_host(y) + self.assertFalse(all(y0 == y)) + + def test_Sgemv(self): + self.Tgemv('Sgemv', np.float32) + + def test_Dgemv(self): + self.Tgemv('Dgemv', np.float64) + + def test_Cgemv(self): + self.Tgemv('Cgemv', np.complex64) + + def test_Zgemv(self): + self.Tgemv('Zgemv', np.complex128) + + def Ttrmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = True + n = 3 + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + lda = n + x0 = x.copy() + inc = 1 + dA = cuda.to_device(A) + dx = cuda.to_device(x) + getattr(blas, fn)(uplo, trans, diag, n, dA, lda, dx, inc) + dx.copy_to_host(x) + self.assertFalse(all(x == x0)) + + def test_Strmv(self): + self.Ttrmv('Strmv', np.float32) + + def test_Dtrmv(self): + self.Ttrmv('Dtrmv', np.float64) + + def test_Ctrmv(self): + self.Ttrmv('Ctrmv', np.complex64) + + def test_Ztrmv(self): + self.Ttrmv('Ztrmv', np.complex128) + + def Ttbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + lda = n + x0 = x.copy() + inc = 1 + k = 0 + getattr(blas, fn)(uplo, trans, diag, n, k, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stbmv(self): + self.Ttbmv('Stbmv', np.float32) + + def test_Dtbmv(self): + self.Ttbmv('Dtbmv', np.float64) + + def test_Ctbmv(self): + self.Ttbmv('Ctbmv', np.complex64) + + def test_Ztbmv(self): + self.Ttbmv('Ztbmv', np.complex128) + + def Ttpmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dAP, dx, inc) 
+ dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stpmv(self): + self.Ttpmv('Stpmv', np.float32) + + def test_Dtpmv(self): + self.Ttpmv('Dtpmv', np.float64) + + def test_Ctpmv(self): + self.Ttpmv('Ctpmv', np.complex64) + + def test_Ztpmv(self): + self.Ttpmv('Ztpmv', np.complex128) + + def Ttrsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + lda = n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Strsv(self): + self.Ttrsv('Strsv', np.float32) + + def test_Dtrsv(self): + self.Ttrsv('Dtrsv', np.float64) + + def test_Ctrsv(self): + self.Ttrsv('Ctrsv', np.complex64) + + def test_Ztrsv(self): + self.Ttrsv('Ztrsv', np.complex128) + + def _Ttpsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, dA, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stpsv(self): + self._Ttpsv('Stpsv', np.float32) + + def test_Dtpsv(self): + self._Ttpsv('Dtpsv', np.float64) + + def test_Ctpsv(self): + self._Ttpsv('Ctpsv', np.complex64) + + def test_Ztpsv(self): + self._Ttpsv('Ztpsv', np.complex128) + + def _Ttbsv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + blas = cuBlas() + uplo = 'U' + trans = 'N' + diag = False + lda = n = 3 + k = 0 + x0 = x.copy() + inc = 1 + getattr(blas, fn)(uplo, trans, diag, n, k, dA, lda, dx, inc) + dx.copy_to_host(x) + + self.assertFalse(all(x == x0)) + + def test_Stbsv(self): + self._Ttbsv('Stbsv', np.float32) + + def test_Dtbsv(self): + self._Ttbsv('Dtbsv', np.float64) + + def test_Ctbsv(self): + self._Ttbsv('Ctbsv', np.complex64) + + def test_Ztbsv(self): + self._Ttbsv('Ztbsv', np.complex128) + + def _Tsymv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + lda = n = 3 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dA, lda, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Ssymv(self): + self._Tsymv('Ssymv', np.float32) + + def test_Dsymv(self): + self._Tsymv('Dsymv', np.float64) + + def test_Csymv(self): + self._Tsymv('Csymv', np.complex64) + + def test_Zsymv(self): + self._Tsymv('Zsymv', np.complex128) + + _Themv = _Tsymv + + def test_Chemv(self): + self._Themv('Chemv', np.complex64) + + def test_Zhemv(self): + self._Themv('Zhemv', np.complex128) + + def _Tsbmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], 
dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + lda = n = 3 + k = 0 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, k, alpha, dA, lda, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Ssbmv(self): + self._Tsbmv('Ssbmv', np.float32) + + def test_Dsbmv(self): + self._Tsbmv('Dsbmv', np.float64) + + _Thbmv = _Tsbmv + + def test_Chbmv(self): + self._Thbmv('Chbmv', np.complex64) + + def test_Zhbmv(self): + self._Thbmv('Zhbmv', np.complex128) + + def _Tspmv(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + beta = .34 + blas = cuBlas() + uplo = 'U' + n = 3 + y0 = y.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dAP, dx, incx, beta, dy, incy) + dy.copy_to_host(y) + + self.assertFalse(all(y == y0)) + + def test_Sspmv(self): + self._Tspmv('Sspmv', np.float32) + + def test_Dspmv(self): + self._Tspmv('Dspmv', np.float64) + + _Thpmv = _Tspmv + + def test_Chpmv(self): + self._Thpmv('Chpmv', np.complex64) + + def test_Zspmv(self): + self._Tspmv('Zhpmv', np.complex128) + + def _Tger(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + + blas = cuBlas() + + lda = m = n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(m, n, alpha, dx, incx, dy, incy, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Sger(self): + self._Tger('Sger', np.float32) + + def test_Dger(self): + self._Tger('Dger', np.float64) + + def test_Cgeru(self): + self._Tger('Cgeru', np.complex64) + + def test_Cgerc(self): + self._Tger('Cgerc', np.complex64) + + def test_Zgeru(self): + self._Tger('Zgeru', np.complex128) + + def test_Zgerc(self): + self._Tger('Zgerc', np.complex128) + + + def _Tsyr(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Ssyr(self): + self._Tsyr('Ssyr', np.float32) + + def test_Dsyr(self): + self._Tsyr('Dsyr', np.float64) + + def test_Csyr(self): + self._Tsyr('Csyr', np.complex64) + + def test_Zsyr(self): + self._Tsyr('Zsyr', np.complex128) + + def _Ther(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + def test_Cher(self): + self._Ther('Cher', np.complex64) + + def test_Zher(self): + 
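+        # her (the Hermitian rank-1 update) is only defined for complex dtypes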
self._Ther('Zher', np.complex128) + + def _Tspr(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + dAP = cuda.to_device(AP) + dx = cuda.to_device(x) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + n = 3 + AP0 = AP.copy() + incx = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dAP) + dAP.copy_to_host(AP) + + self.assertFalse(np.all(AP == AP0)) + + def test_Sspr(self): + self._Tspr('Sspr', np.float32) + + def test_Dspr(self): + self._Tspr('Dspr', np.float64) + + def test_Chpr(self): + self._Tspr('Chpr', np.complex64) + + def test_Zhpr(self): + self._Tspr('Zhpr', np.complex128) + + def _Tsyr2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + lda = n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dy, incy, dA, lda) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + _Ther2 = _Tsyr2 + + def test_Ssyr2(self): + self._Tsyr2('Ssyr2', np.float32) + + def test_Dsyr2(self): + self._Tsyr2('Dsyr2', np.float64) + + def test_Csyr2(self): + self._Tsyr2('Csyr2', np.complex64) + + def test_Zsyr2(self): + self._Tsyr2('Zsyr2', np.complex128) + + def test_Cher2(self): + self._Ther2('Cher2', np.complex64) + + def test_Zher2(self): + self._Ther2('Zher2', np.complex128) + + def _Tspr2(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dy = cuda.to_device(y) + + alpha = 1.2 + uplo = 'U' + blas = cuBlas() + + n = 3 + A0 = A.copy() + incx = incy = 1 + getattr(blas, fn)(uplo, n, alpha, dx, incx, dy, incy, dA) + dA.copy_to_host(A) + + self.assertFalse(np.all(A == A0)) + + _Thpr2 = _Tspr2 + + def test_Sspr2(self): + self._Tspr2('Sspr2', np.float32) + + def test_Dspr2(self): + self._Tspr2('Sspr2', np.float64) + + def test_Chpr2(self): + self._Thpr2('Chpr2', np.complex64) + + def test_Zhpr2(self): + self._Thpr2('Zhpr2', np.complex128) + + # Level 3 + + def _Tgemm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + transa = 'N' + transb = 'N' + blas = cuBlas() + + lda = ldb = ldc = m = n = k = 3 + C0 = C.copy() + getattr(blas, fn)(transa, transb, m, n, k, alpha, dA, lda, dB, ldb, + beta, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sgemm(self): + self._Tgemm('Sgemm', np.float32) + + def test_Dgemm(self): + self._Tgemm('Dgemm', np.float64) + + def test_Cgemm(self): + self._Tgemm('Cgemm', np.complex64) + + + def test_Zgemm(self): + self._Tgemm('Zgemm', np.complex128) + + + def _Tsyrk(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + 
[0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + uplo = 'U' + trans = 'N' + + blas = cuBlas() + + lda = ldc = n = k = 3 + C0 = C.copy() + getattr(blas, fn)(uplo, trans, n, k, alpha, dA, lda, beta, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Ssyrk(self): + self._Tsyrk('Ssyrk', np.float32) + + def test_Dsyrk(self): + self._Tsyrk('Dsyrk', np.float64) + + def test_Csyrk(self): + self._Tsyrk('Csyrk', np.complex64) + + def test_Zsyrk(self): + self._Tsyrk('Zsyrk', np.complex128) + + _Therk = _Tsyrk + + def test_Cherk(self): + self._Therk('Cherk', np.complex64) + + def test_Zherk(self): + self._Therk('Zherk', np.complex128) + + def _Tsymm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + side = 'L' + uplo = 'U' + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(side, uplo, m, n, alpha, dA, lda, dB, ldb, beta, dC, + ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Ssymm(self): + self._Tsymm('Ssymm', np.float32) + + def test_Dsymm(self): + self._Tsymm('Dsymm', np.float64) + + def test_Csymm(self): + self._Tsymm('Csymm', np.complex64) + + def test_Zsymm(self): + self._Tsymm('Zsymm', np.complex128) + + _Themm = _Tsymm + + def test_Chemm(self): + self._Themm('Chemm', np.complex64) + + def test_Zhemm(self): + self._Themm('Zhemm', np.complex128) + + def _Ttrsm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + dA = cuda.to_device(A) + dB = cuda.to_device(B) + + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + + blas = cuBlas() + + lda = ldb = m = n = 3 + B0 = B.copy() + getattr(blas, fn)(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, + ldb) + dB.copy_to_host(B) + + self.assertFalse(np.all(B == B0)) + + def test_Strsm(self): + self._Ttrsm('Strsm', np.float32) + + def test_Dtrsm(self): + self._Ttrsm('Dtrsm', np.float64) + + def test_Ctrsm(self): + self._Ttrsm('Ctrsm', np.complex64) + + def test_Ztrsm(self): + self._Ttrsm('Ztrsm', np.complex128) + + def _Ttrmm(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, + ldb, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Strmm(self): + self._Ttrmm('Strmm', np.float32) + + def test_Dtrmm(self): + self._Ttrmm('Dtrmm', np.float64) + + def test_Ctrmm(self): + self._Ttrmm('Ctrmm', np.complex64) + + def test_Ztrmm(self): + self._Ttrmm('Ztrmm', np.complex128) + + + def _Tdgmm(self, fn, 
dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 2.4], dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dx = cuda.to_device(x) + dC = cuda.to_device(C) + + side = 'L' + + blas = cuBlas() + + lda = ldc = m = n = 3 + C0 = C.copy() + incx = 1 + getattr(blas, fn)(side, m, n, dA, lda, dx, incx, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sdgmm(self): + self._Tdgmm('Sdgmm', np.float32) + + def test_Ddgmm(self): + self._Tdgmm('Ddgmm', np.float64) + + def test_Cdgmm(self): + self._Tdgmm('Cdgmm', np.complex64) + + def test_Zdgmm(self): + self._Tdgmm('Zdgmm', np.complex128) + + + def _Tgeam(self, fn, dtype): + from pyculib.blas.binding import cuBlas + + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + + dA = cuda.to_device(A) + dB = cuda.to_device(B) + dC = cuda.to_device(C) + + alpha = 1.2 + beta = .34 + + transa = 'N' + transb = 'N' + + blas = cuBlas() + + lda = ldb = ldc = m = n = 3 + C0 = C.copy() + getattr(blas, fn)(transa, transb, m, n, alpha, dA, lda, beta, dB, + ldb, dC, ldc) + dC.copy_to_host(C) + + self.assertFalse(np.all(C == C0)) + + def test_Sgeam(self): + self._Tgeam('Sgeam', np.float32) + + def test_Dgeam(self): + self._Tgeam('Dgeam', np.float64) + + def test_Cgeam(self): + self._Tgeam('Cgeam', np.complex64) + + def test_Zgeam(self): + self._Tgeam('Zgeam', np.complex128) + + +class TestCuBlasAPI(CUDATestCase): + def setUp(self): + from pyculib.blas import Blas + + self.blas = Blas() + + + def Tnrm2(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + exp = np.linalg.norm(x) + self.assertTrue(np.allclose(got, exp)) + + def test_nrm2(self): + self.Tnrm2(self.blas.nrm2, np.float32) + self.Tnrm2(self.blas.nrm2, np.float64) + self.Tnrm2(self.blas.nrm2, np.complex64) + self.Tnrm2(self.blas.nrm2, np.complex128) + + def Tdot(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + got = self.blas.dot(x, y) + exp = np.dot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_dot(self): + self.Tdot(self.blas.dot, np.float32) + self.Tdot(self.blas.dot, np.float64) + self.Tdot(self.blas.dot, np.complex64) + self.Tdot(self.blas.dot, np.complex128) + + def Tdotc(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + got = self.blas.dotc(x, y) + exp = np.vdot(x, y) + self.assertTrue(np.allclose(got, exp)) + + def test_dotc(self): + self.Tdot(self.blas.dotc, np.float32) + + def Tscal(self, fn, dtype, alpha): + x = np.random.random(10).astype(dtype) + x0 = x.copy() + fn(alpha, x) + self.assertTrue(np.allclose(x0 * alpha, x)) + + def test_scal(self): + self.Tscal(self.blas.scal, np.float32, 1.234) + self.Tscal(self.blas.scal, np.float64, 1.234) + self.Tscal(self.blas.scal, np.complex64, 1.234 + 5j) + self.Tscal(self.blas.scal, np.complex128, 1.234 + 5j) + self.Tscal(self.blas.scal, np.complex64, 1.234) + self.Tscal(self.blas.scal, np.complex128, 1.234) + + def Taxpy(self, fn, dtype, alpha): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + y0 = y.copy() + + fn(alpha, x, y) + + self.assertTrue(np.allclose(alpha * x + y0, y)) + + def test_axpy(self): + 
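+        # real alpha for the float dtypes, purely imaginary alpha for the complex dtypes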
self.Taxpy(self.blas.axpy, np.float32, 1.234) + self.Taxpy(self.blas.axpy, np.float64, 1.234) + self.Taxpy(self.blas.axpy, np.complex64, 1.234j) + self.Taxpy(self.blas.axpy, np.complex128, 1.234j) + + def Itamax(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.argmax(x), got)) + + def test_amax(self): + self.Itamax(self.blas.amax, np.float32) + self.Itamax(self.blas.amax, np.float64) + self.Itamax(self.blas.amax, np.complex64) + self.Itamax(self.blas.amax, np.complex128) + + def Itamin(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.argmin(x), got)) + + def test_amin(self): + self.Itamin(self.blas.amin, np.float32) + self.Itamin(self.blas.amin, np.float64) + self.Itamin(self.blas.amin, np.complex64) + self.Itamin(self.blas.amin, np.complex128) + + def Tasum(self, fn, dtype): + x = np.random.random(10).astype(dtype) + got = fn(x) + self.assertTrue(np.allclose(np.sum(x), got)) + + def test_asum(self): + self.Tasum(self.blas.asum, np.float32) + self.Tasum(self.blas.asum, np.float64) + self.Tasum(self.blas.asum, np.complex64) + self.Tasum(self.blas.asum, np.complex128) + + def Trot(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + angle = 1.342 + c = np.cos(angle) + s = np.sin(angle) + + x0, y0 = c * x + s * y, -s * x + c * y + + fn(x, y, c, s) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_rot(self): + self.Trot(self.blas.rot, np.float32) + self.Trot(self.blas.rot, np.float64) + self.Trot(self.blas.rot, np.complex64) + self.Trot(self.blas.rot, np.complex128) + + def Trotg(self, fn, dt1, dt2): + a, b = (np.array(np.random.random(), dtype=dt1), + np.array(np.random.random(), dtype=dt2)) + r, z, c, s = fn(a, b) + + rot = np.array([[c, s], + [-np.conj(s), c]]) + vec = np.array([[a], + [b]]) + exp = np.dot(rot, vec) + got = np.array([[r], + [0.0]]) + self.assertTrue(np.allclose(exp, got, atol=1e-6)) + + def test_rotg(self): + self.Trotg(self.blas.rotg, np.float32, np.float32) + self.Trotg(self.blas.rotg, np.float64, np.float64) + self.Trotg(self.blas.rotg, np.complex64, np.complex64) + self.Trotg(self.blas.rotg, np.complex128, np.complex128) + + def Trotm(self, fn, dtype): + x = np.random.random(10).astype(dtype) + y = np.random.random(10).astype(dtype) + + param = np.random.random(5).astype(dtype) + param[0] = -1.0 + h11, h21, h12, h22 = param[1:].tolist() + + x0, y0 = h11 * x + h12 * y, h21 * x + h22 * y + + fn(x, y, param) + + self.assertTrue(np.allclose(x, x0)) + self.assertTrue(np.allclose(y, y0)) + + def test_rotm(self): + self.Trotm(self.blas.rotm, np.float32) + self.Trotm(self.blas.rotm, np.float64) + + def Trotmg(self, fn, dtype): + d1, d2, x1, y1 = np.random.random(4).tolist() + + param = fn(d1, d2, x1, y1) + + flag, h11, h21, h12, h22 = param.tolist() + + if flag == -1.0: + pass # don't know how to check + elif flag == 0.0: + self.assertEqual(h11, 0) + self.assertEqual(h22, 0) + elif flag == 1.0: + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + else: + self.assertEqual(flag, -2.0) + self.assertEqual(h11, 0) + self.assertEqual(h12, 0) + self.assertEqual(h21, 0) + self.assertEqual(h22, 0) + + def test_rotmg(self): + self.Trotmg(self.blas.rotmg, np.float32) + self.Trotmg(self.blas.rotmg, np.float64) + + # Level 2 + + def _test_all(self, test, fn): + dtypes = np.float32, np.float64, np.complex64, np.complex128 + for dt in dtypes: + test(fn, dt) + + def _test_float(self, test, 
fn): + dtypes = np.float32, np.float64 + for dt in dtypes: + test(fn, dt) + + def _test_complex(self, test, fn): + dtypes = np.complex64, np.complex128 + for dt in dtypes: + test(fn, dt) + + def Tgbmv(self, fn, dtype): + kl = 0 + ku = 0 + alpha = 1. + beta = 0. + A = np.array([[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + lda, n = A.shape + m = lda + y0 = y.copy() + fn('N', m, n, kl, ku, alpha, A, x, beta, y) + self.assertFalse(all(y0 == y)) + + def test_gbmv(self): + self._test_all(self.Tgbmv, self.blas.gbmv) + + def Tgemv(self, fn, dtype): + alpha = 1. + beta = 0. + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([1, 2, 3], dtype=dtype) + m, n = A.shape + y0 = y.copy() + + fn('N', m, n, alpha, A, x, beta, y) + self.assertFalse(all(y0 == y)) + + def test_gemv(self): + self._test_all(self.Tgemv, self.blas.gemv) + + def Ttrmv(self, fn, dtype): + uplo = 'U' + trans = 'N' + diag = True + n = 3 + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_trmv(self): + self._test_all(self.Ttrmv, self.blas.trmv) + + def Ttbmv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + k = 0 + fn(uplo, trans, diag, n, k, A, x) + self.assertFalse(all(x == x0)) + + def test_tbmv(self): + self._test_all(self.Ttbmv, self.blas.tbmv) + + + def Ttpmv(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, AP, x) + self.assertFalse(all(x == x0)) + + def test_tpmv(self): + self._test_all(self.Ttpmv, self.blas.tpmv) + + def Ttrsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_trsv(self): + self._test_all(self.Ttrsv, self.blas.trsv) + + def Ttpsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + x0 = x.copy() + fn(uplo, trans, diag, n, A, x) + self.assertFalse(all(x == x0)) + + def test_tpsv(self): + self._test_all(self.Ttpsv, self.blas.tpsv) + + def Ttbsv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + uplo = 'U' + trans = 'N' + diag = False + n = 3 + k = 0 + x0 = x.copy() + fn(uplo, trans, diag, n, k, A, x) + self.assertFalse(all(x == x0)) + + def test_tbsv(self): + self._test_all(self.Ttbsv, self.blas.tbsv) + + + def Tsymv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + y0 = y.copy() + fn(uplo, n, alpha, A, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_symv(self): + self._test_all(self.Tsymv, self.blas.symv) + + Themv = Tsymv + + def 
test_hemv(self): + self._test_complex(self.Themv, self.blas.hemv) + + + def Tsbmv(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + k = 0 + y0 = y.copy() + fn(uplo, n, k, alpha, A, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_sbmv(self): + self._test_float(self.Tsbmv, self.blas.sbmv) + + Thbmv = Tsbmv + + def test_hbmv(self): + self._test_complex(self.Thbmv, self.blas.hbmv) + + def Tspmv(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + n = 3 + y0 = y.copy() + fn(uplo, n, alpha, AP, x, beta, y) + self.assertFalse(all(y == y0)) + + def test_spmv(self): + self._test_float(self.Tspmv, self.blas.spmv) + + Thpmv = Tspmv + + def test_hpmv(self): + self._test_complex(self.Thpmv, self.blas.hpmv) + + def Tger(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + m = n = 3 + A0 = A.copy() + fn(m, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + def test_ger(self): + self._test_float(self.Tger, self.blas.ger) + + def test_geru(self): + self._test_complex(self.Tger, self.blas.geru) + + def test_gerc(self): + self._test_complex(self.Tger, self.blas.gerc) + + def Tsyr(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, A) + self.assertFalse(np.all(A == A0)) + + def test_syr(self): + self._test_all(self.Tsyr, self.blas.syr) + + def Ther(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, A) + self.assertFalse(np.all(A == A0)) + + def test_her(self): + self._test_complex(self.Ther, self.blas.her) + + def Tspr(self, fn, dtype): + AP = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + AP0 = AP.copy() + fn(uplo, n, alpha, x, AP) + self.assertFalse(np.all(AP == AP0)) + + def test_spr(self): + self._test_float(self.Tspr, self.blas.spr) + + Thpr = Tspr + + def test_hpr(self): + self._test_complex(self.Thpr, self.blas.hpr) + + def Tsyr2(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + Ther2 = Tsyr2 + + def test_syr2(self): + self._test_all(self.Tsyr2, self.blas.syr2) + + def test_her2(self): + self._test_complex(self.Ther2, self.blas.her2) + + def Tspr2(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 3], dtype=dtype) + y = np.array([8, 2, 3], dtype=dtype) + alpha = 1.2 + uplo = 'U' + n = 3 + A0 = A.copy() + fn(uplo, n, alpha, x, y, A) + self.assertFalse(np.all(A == A0)) + + Thpr2 = Tspr2 + + def test_spr2(self): + self._test_float(self.Tspr2, self.blas.spr2) + + def test_hpr2(self): + 
self._test_complex(self.Thpr2, self.blas.hpr2) + + # Level 3 + + def Tgemm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + transa = 'N' + transb = 'N' + m = n = k = 3 + C0 = C.copy() + fn(transa, transb, m, n, k, alpha, A, B, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_gemm(self): + self._test_all(self.Tgemm, self.blas.gemm) + + + def Tsyrk(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + uplo = 'U' + trans = 'N' + n = k = 3 + C0 = C.copy() + fn(uplo, trans, n, k, alpha, A, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_syrk(self): + self._test_all(self.Tsyrk, self.blas.syrk) + + Therk = Tsyrk + + def test_herk(self): + self._test_complex(self.Therk, self.blas.herk) + + def Tsymm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + side = 'L' + uplo = 'U' + m = n = 3 + C0 = C.copy() + fn(side, uplo, m, n, alpha, A, B, beta, C) + self.assertFalse(np.all(C == C0)) + + def test_symm(self): + self._test_all(self.Tsymm, self.blas.symm) + + Themm = Tsymm + + def test_hemm(self): + self._test_complex(self.Themm, self.blas.hemm) + + def Ttrsm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + m = n = 3 + B0 = B.copy() + fn(side, uplo, trans, diag, m, n, alpha, A, B) + self.assertFalse(np.all(B == B0)) + + def test_trsm(self): + self._test_all(self.Ttrsm, self.blas.trsm) + + def Ttrmm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + side = 'L' + uplo = 'U' + trans = 'N' + diag = False + m = n = 3 + C0 = C.copy() + fn(side, uplo, trans, diag, m, n, alpha, A, B, C) + self.assertFalse(np.all(C == C0)) + + def test_trmm(self): + self._test_all(self.Ttrmm, self.blas.trmm) + + def Tdgmm(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + x = np.array([1, 2, 2.4], dtype=dtype) + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + side = 'L' + m = n = 3 + C0 = C.copy() + fn(side, m, n, A, x, C) + self.assertFalse(np.all(C == C0)) + + def test_dgmm(self): + self._test_all(self.Tdgmm, self.blas.dgmm) + + + def Tgeam(self, fn, dtype): + A = np.array([[1, 2, 0], + [0, 3, 0], + [1, 0, 1]], order='F', dtype=dtype) + B = np.array([[2, 2, 0], + [7, 0, 0], + [1, 4, 1]], order='F', dtype=dtype) + + C = np.array([[0, 9, 0], + [0, 1, 1], + [0, 0, 1]], order='F', dtype=dtype) + alpha = 1.2 + beta = .34 + transa = 'N' + transb = 'N' + m = n = 3 + C0 = C.copy() + fn(transa, transb, m, n, alpha, A, beta, B, C) + self.assertFalse(np.all(C == C0)) + + def test_geam(self): + self._test_all(self.Tgeam, 
self.blas.geam) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCuBlasBinding, TestCuBlasAPI) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_fft.py b/pyculib/tests/test_fft.py new file mode 100644 index 0000000..b3e7b9b --- /dev/null +++ b/pyculib/tests/test_fft.py @@ -0,0 +1,364 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCuFFTLib(CUDATestCase): + def test_lib(self): + from pyculib.fft.binding import libcufft + cufft = libcufft() + self.assertNotEqual(libcufft().version, 0) + + +class TestCuFFTPlan(CUDATestCase): + def test_plan1d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 10 + data = np.arange(n, dtype=np.complex64) + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.one(CUFFT_C2C, n) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + def test_plan2d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 2**4 + data = np.arange(n, dtype=np.complex64).reshape(2, n//2) + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.two(CUFFT_C2C, *data.shape) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + def test_plan3d(self): + from pyculib.fft.binding import Plan, CUFFT_C2C + n = 32 + data = np.arange(n, dtype=np.complex64).reshape(2, 2, 8) + + orig = data.copy() + d_data = cuda.to_device(data) + fftplan = Plan.three(CUFFT_C2C, *data.shape) + fftplan.forward(d_data, d_data) + fftplan.inverse(d_data, d_data) + d_data.copy_to_host(data) + result = data / n + self.assertTrue(np.allclose(orig, result.real)) + + + def test_against_fft_1d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + N = 128 + x = np.asarray(np.arange(N), dtype=np.float32) + xf = np.fft.fft(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(N//2+1, np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, + atol=1e-6) ) + + def test_against_fft_2d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + rank = 2 + rowsize = 128 + N = rowsize * rank + x = np.arange(N, dtype=np.float32).reshape(rank, rowsize) + xf = np.fft.fft2(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(shape=(rank, rowsize//2 + 1), dtype=np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue(np.allclose(xf[:, 0:rowsize//2+1], xf_gpu, atol=1e-6)) + + def test_against_fft_3d(self): + from pyculib.fft.binding import Plan, CUFFT_R2C + depth = 2 + colsize = 2 + rowsize = 64 + N = depth * colsize * rowsize + x = np.arange(N, dtype=np.float32).reshape(depth, colsize, rowsize) + xf = np.fft.fftn(x) + d_x_gpu = cuda.to_device(x) + xf_gpu = np.zeros(shape=(depth, colsize, rowsize//2 + 1), 
dtype=np.complex64) + d_xf_gpu = cuda.to_device(xf_gpu) + plan = Plan.many(x.shape, CUFFT_R2C) + plan.forward(d_x_gpu, d_xf_gpu) + d_xf_gpu.copy_to_host(xf_gpu) + self.assertTrue(np.allclose(xf[:, :, 0:rowsize//2+1], xf_gpu, atol=1e-6)) + + +class TestCuFFTAPI(CUDATestCase): + def test_fft_1d_single(self): + from pyculib.fft import fft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float32) + xf = np.fft.fft(x) + + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_1d_double(self): + from pyculib.fft import fft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float64) + xf = np.fft.fft(x) + + xf_gpu = np.zeros(shape=N//2 + 1, dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[0:N//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_2d_single(self): + from pyculib.fft import fft + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N2, N1) + xf = np.fft.fft2(x) + + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_2d_single_col_major(self): + from pyculib.fft import fft + N2 = 2 + N1 = 8 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float32).reshape((N2, N1), order='F') + xf_ref = np.fft.rfft2(x) + + xf = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64, order='F') + fft(x, xf) + self.assertTrue( np.allclose(xf_ref, xf, atol=1e-6) ) + + def test_invfft_2d_single_col_major(self): + from pyculib.fft import ifft + N2 = 4 + N1 = 16 + N = (N1//2 + 1) * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape((N2, N1//2 + 1), order='F') + xf_ref = np.fft.irfft2(x) + xf = np.empty(shape=(N2, N1), dtype=np.float32, order='F') + ifft(x, xf) + # Note the different normalization conventions in np.fft and cuda.fft ! 
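+        # cuFFT's inverse transform is unnormalized, so dividing by the number of samples (N1*N2) matches np.fft.irfft2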
+ xf /= N1*N2 + self.assertTrue(np.allclose(xf_ref, xf, atol=1e-6)) + + def test_fft_2d_double(self): + from pyculib.fft import fft + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.float64).reshape(N2, N1) + xf = np.fft.fft2(x) + + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_3d_single(self): + from pyculib.fft import fft + N3 = 2 + N2 = 2 + N1 = 32 + N = N1 * N2 * N3 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N3, N2, N1) + xf = np.fft.fftn(x) + + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, :, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_3d_double(self): + from pyculib.fft import fft + N3 = 2 + N2 = 2 + N1 = 32 + N = N1 * N2 * N3 + x = np.asarray(np.arange(N), dtype=np.float64).reshape(N3, N2, N1) + xf = np.fft.fftn(x) + + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex128) + fft(x, xf_gpu) + + self.assertTrue( np.allclose(xf[:, :, 0:N1//2+1], xf_gpu, atol=1e-6) ) + + def test_fft_1d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float32) + x0 = x.copy() + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_1d_roundtrip_double(self): + from pyculib.fft import fft, ifft + N = 32 + x = np.asarray(np.arange(N), dtype=np.float64) + x0 = x.copy() + xf_gpu = np.empty(shape=N//2 + 1, dtype=np.complex128) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + + def test_fft_2d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N2 = 2 + N1 = 32 + N = N2 * N1 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N2, N1) + x0 = x.copy() + xf_gpu = np.empty(shape=(N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_3d_roundtrip_single(self): + from pyculib.fft import fft, ifft + N3 = 2 + N2 = 2 + N1 = 32 + N = N3 * N2 * N1 + x = np.asarray(np.arange(N), dtype=np.float32).reshape(N3, N2, N1) + x0 = x.copy() + xf_gpu = np.empty(shape=(N3, N2, N1//2 + 1), dtype=np.complex64) + fft(x, xf_gpu) + ifft(xf_gpu, x) + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_inplace_1d_single(self): + from pyculib.fft import fft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex64) + xf = np.fft.fft(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_1d_double(self): + from pyculib.fft import fft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex128) + xf = np.fft.fft(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_2d_single(self): + from pyculib.fft import fft_inplace + N1 = 32 + N2 = 2 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape(N2, N1) + xf = np.fft.fft2(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_inplace_2d_double(self): + from pyculib.fft import fft_inplace + N1 = 32 + N2 = 2 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex128).reshape(N2, N1) + xf = np.fft.fft2(x) + + fft_inplace(x) + + self.assertTrue( np.allclose(xf, x, atol=1e-6) ) + + def test_fft_1d_roundtrip_single_2(self): + from pyculib.fft import fft_inplace, 
ifft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex64) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_1d_roundtrip_double_2(self): + from pyculib.fft import fft_inplace, ifft_inplace + N = 32 + x = np.asarray(np.arange(N), dtype=np.complex128) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + + def test_fft_2d_roundtrip_single_2(self): + from pyculib.fft import fft_inplace, ifft_inplace + N2 = 2 + N1 = 32 + N = N1 * N2 + x = np.asarray(np.arange(N), dtype=np.complex64).reshape(N2, N1) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + def test_fft_3d_roundtrip_double(self): + from pyculib.fft import fft_inplace, ifft_inplace + N3 = 2 + N2 = 2 + N1 = 8 + N = N3 * N2 * N1 + x = np.asarray(np.arange(N), dtype=np.complex128).reshape(N3, N2, N1) + x0 = x.copy() + + fft_inplace(x) + ifft_inplace(x) + + self.assertTrue( np.allclose(x / N, x0, atol=1e-6) ) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + + +test_cases = (TestCuFFTLib, TestCuFFTPlan, TestCuFFTAPI) + +if __name__ == '__main__': + unittest.main() + diff --git a/pyculib/tests/test_rand.py b/pyculib/tests/test_rand.py new file mode 100644 index 0000000..eb4f98e --- /dev/null +++ b/pyculib/tests/test_rand.py @@ -0,0 +1,219 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase +from numba import cuda + + +class TestCURand(CUDATestCase): + def test_lib(self): + from pyculib.rand.binding import libcurand + curand = libcurand() + self.assertNotEqual(libcurand().version, 0) + +class TestCURandPseudo(CUDATestCase): + def setUp(self): + from pyculib.rand.binding import (Generator, + CURAND_RNG_PSEUDO_DEFAULT) + + self.N = 10 + self.ary32 = np.zeros(self.N, dtype=np.float32) + self.ary64 = np.zeros(self.N, dtype=np.float64) + + self.stream = cuda.stream() + self.devary32 = cuda.to_device(self.ary32, stream=self.stream) + self.devary64 = cuda.to_device(self.ary64, stream=self.stream) + + self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT) + self.rndgen.set_stream(self.stream) + self.rndgen.set_offset(123) + self.rndgen.set_pseudo_random_generator_seed(1234) + + def tearDown(self): + self.devary32.copy_to_host(self.ary32, stream=self.stream) + self.devary64.copy_to_host(self.ary64, stream=self.stream) + + self.stream.synchronize() + + self.assertTrue(any(self.ary32 != 0)) + self.assertTrue(any(self.ary64 != 0)) + + del self.N + del self.ary32 + del self.ary64 + del self.stream + del self.devary32 + del self.devary64 + + def test_uniform(self): + self.rndgen.generate_uniform(self.devary32, self.N) + self.rndgen.generate_uniform(self.devary64, self.N) + + + def test_normal(self): + self.rndgen.generate_normal(self.devary32, self.N, 0, 1) + self.rndgen.generate_normal(self.devary64, self.N, 0, 1) + + def test_log_normal(self): + self.rndgen.generate_log_normal(self.devary32, self.N, 0, 1) + self.rndgen.generate_log_normal(self.devary64, self.N, 0, 1) + + +class TestCURandPoisson(CUDATestCase): + def setUp(self): + from pyculib.rand.binding 
import (Generator, + CURAND_RNG_PSEUDO_DEFAULT) + + self.N = 10 + self.ary32 = np.zeros(self.N, dtype=np.uint32) + + self.stream = cuda.stream() + self.devary32 = cuda.to_device(self.ary32, stream=self.stream) + + self.rndgen = Generator(CURAND_RNG_PSEUDO_DEFAULT) + self.rndgen.set_stream(self.stream) + self.rndgen.set_offset(123) + self.rndgen.set_pseudo_random_generator_seed(1234) + + def tearDown(self): + self.devary32.copy_to_host(self.ary32, stream=self.stream) + + self.stream.synchronize() + + self.assertTrue(any(self.ary32 != 0)) + + del self.N + del self.ary32 + del self.stream + del self.devary32 + + def test_poisson(self): + self.rndgen.generate_poisson(self.devary32, self.N, 1) + + +class TestCURandQuasi(CUDATestCase): + def test_generate(self): + from pyculib.rand.binding import (Generator, + CURAND_RNG_QUASI_SOBOL64, + CURAND_RNG_QUASI_DEFAULT) + N = 10 + stream = cuda.stream() + + ary32 = np.zeros(N, dtype=np.uint32) + devary32 = cuda.to_device(ary32, stream=stream) + + rndgen = Generator(CURAND_RNG_QUASI_DEFAULT) + rndgen.set_stream(stream) + rndgen.set_offset(123) + rndgen.set_quasi_random_generator_dimensions(1) + rndgen.generate(devary32, N) + + devary32.copy_to_host(ary32, stream=stream) + stream.synchronize() + + self.assertTrue(any(ary32 != 0)) + + + ary64 = np.zeros(N, dtype=np.uint64) + devary64 = cuda.to_device(ary64, stream=stream) + + rndgen = Generator(CURAND_RNG_QUASI_SOBOL64) + rndgen.set_stream(stream) + rndgen.set_offset(123) + rndgen.set_quasi_random_generator_dimensions(1) + rndgen.generate(devary64, N) + + devary64.copy_to_host(ary64, stream=stream) + stream.synchronize() + + self.assertTrue(any(ary64 != 0)) + + +class TestCURandAPI(CUDATestCase): + def test_pseudo(self): + from pyculib import rand + prng = rand.PRNG() + prng.seed = 0xbeef + N = 10 + ary = np.zeros(N, dtype=np.float32) + prng.uniform(ary, N) + self.assertTrue(any(ary != 0)) + + iary = np.zeros(N, dtype=np.uint32) + prng.poisson(iary, N) + self.assertTrue(any(iary != 0)) + + def test_quasi(self): + from pyculib import rand + qrng = rand.QRNG() + qrng.ndim = 2 + N = 10 + ary = np.zeros(N, dtype=np.uint32) + qrng.generate(ary, N) + self.assertTrue(any(ary != 0)) + + +class TestTopLevel(CUDATestCase): + def test_uniform(self): + from pyculib import rand + A = rand.uniform(10) + B = rand.uniform(10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_normal(self): + from pyculib import rand + A = rand.normal(0, 1, 10) + B = rand.normal(0, 1, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_lognormal(self): + from pyculib import rand + A = rand.lognormal(0, 1, 10) + B = rand.lognormal(0, 1, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_poisson(self): + from pyculib import rand + A = rand.poisson(10, 10) + B = rand.poisson(10, 10) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + def test_quasi(self): + from pyculib import rand + A = rand.quasi(10, nd=1, bits=32) + B = rand.quasi(10, nd=1, bits=32) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=1, bits=64) + B = rand.quasi(10, nd=1, bits=64) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=5, bits=32) + B = rand.quasi(10, nd=5, bits=32) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + A = rand.quasi(10, nd=5, bits=64) + B = rand.quasi(10, nd=5, bits=64) + self.assertTrue(np.mean(abs((A - B) / B)) > .10) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + 
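+    # pick up every test case class defined in this module by its 'Test' name prefix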
for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCURand, TestCURandPseudo, TestCURandPoisson, TestCURandQuasi, + TestCURandAPI, TestTopLevel) + + +if __name__ == '__main__': + unittest.main() + diff --git a/pyculib/tests/test_sorting.py b/pyculib/tests/test_sorting.py new file mode 100644 index 0000000..22e8fe0 --- /dev/null +++ b/pyculib/tests/test_sorting.py @@ -0,0 +1,151 @@ +from __future__ import print_function, absolute_import, division +import numpy as np +import unittest +from .base import CUDATestCase + +SELECT_THRESHOLD = 100000 + + +class TestRadixSort(CUDATestCase): + def _test_sort(self, dtype, counts, reverse=False, seed=0, + getindices=False): + from pyculib import sorting + np.random.seed(seed) + for count in counts: + data = (np.random.rand(count) * 10 * count).astype(dtype) + orig = data.copy() + gold = data.copy() + gold.sort() + + if reverse: + gold = gold[::-1] + rs = sorting.RadixSort(maxcount=count, dtype=data.dtype, + descending=reverse) + if getindices: + indices = rs.argsort(data) + else: + indices = rs.sort(data) + self.assertTrue(np.all(data == gold)) + if getindices: + self.assertTrue(np.all(orig[indices] == gold)) + else: + self.assertIsNone(indices) + + def test_sort_float32(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.float32, counts) + self._test_sort(np.float32, counts, reverse=True) + self._test_sort(np.float32, counts, reverse=True, getindices=True) + + def test_sort_int32(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.int32, counts) + self._test_sort(np.int32, counts, reverse=True) + self._test_sort(np.int32, counts, reverse=True, getindices=True) + + def test_sort_float64(self): + counts = [1, 2, 10, 13, 31, 73] + self._test_sort(np.float64, counts) + self._test_sort(np.float64, counts, reverse=True) + self._test_sort(np.float64, counts, reverse=True, getindices=True) + + def _test_select(self, dtype, counts, ks, reverse=False, seed=0, + getindices=False): + from pyculib import sorting + np.random.seed(seed) + for k, count in zip(ks, counts): + data = (np.random.rand(count) * 10 * count).astype(dtype) + orig = data.copy() + gold = data.copy() + gold.sort() + if reverse: + gold = gold[::-1] + gold = gold[:k] + rs = sorting.RadixSort(maxcount=count, dtype=data.dtype, + descending=reverse) + if getindices: + indices = rs.argselect(keys=data, k=k) + else: + indices = rs.select(keys=data, k=k) + data = data[:k] + self.assertTrue(np.all(data == gold)) + # print(data, gold) + if getindices: + # print(indices) + # print(orig[indices]) + self.assertTrue(np.all(orig[indices] == gold)) + else: + self.assertIsNone(indices) + + def test_select_float32(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.float32, counts, ks) + self._test_select(np.float32, counts, ks, reverse=True) + self._test_select(np.float32, counts, ks, reverse=True, + getindices=True) + + def test_select_int32(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.int32, counts, ks) + self._test_select(np.int32, counts, ks, reverse=True) + self._test_select(np.int32, counts, ks, reverse=True, + getindices=True) + + def test_select_float64(self): + counts = [1, 2, 10, 13, 31, 73, 100, 101, 
SELECT_THRESHOLD] + ks = [1, 1, 3, 5, 10, 60, 99, 101, 1000] + self._test_select(np.float64, counts, ks) + self._test_select(np.float64, counts, ks, reverse=True, + getindices=True) + + +class TestSegmentedSort(CUDATestCase): + def _test_generic(self, dtype, divby=1): + from pyculib import sorting + keys = np.array(list(reversed(range(100))), dtype=dtype) / divby + reference = keys.copy() + original = keys.copy() + vals = np.arange(keys.size, dtype=np.int32) + segments = np.array([10, 40, 70], dtype=np.int32) + sorting.segmented_sort(keys, vals, segments) + + reference[:10].sort() + reference[10:40].sort() + reference[40:70].sort() + reference[70:].sort() + + self.assertTrue(np.all(keys == reference)) + self.assertTrue(np.all(original[vals] == reference)) + + def test_float32(self): + self._test_generic(np.float32, divby=10) + + def test_float64(self): + self._test_generic(np.float64, divby=10) + + def test_int32(self): + self._test_generic(np.int32) + + def test_int64(self): + self._test_generic(np.int64) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestRadixSort, TestSegmentedSort) + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/tests/test_sparse.py b/pyculib/tests/test_sparse.py new file mode 100644 index 0000000..4d751d9 --- /dev/null +++ b/pyculib/tests/test_sparse.py @@ -0,0 +1,660 @@ +from __future__ import print_function, absolute_import +import numpy as np +import scipy.sparse +import unittest +from .base import CUDATestCase + + +class TestCuSparseLevel1(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def tearDown(self): + del self.cus + + def generic_test_axpyi(self, dtype): + alpha = 2 + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.zeros(shape=xval.size * 2, dtype=xval.dtype) + self.cus.axpyi(alpha, xval, xind, y) + self.assertTrue(np.allclose(y[xind], (xval * 2))) + + def test_Saxpyi(self): + self.generic_test_axpyi(dtype=np.float32) + + def test_Daxpyi(self): + self.generic_test_axpyi(dtype=np.float64) + + def test_Caxpyi(self): + self.generic_test_axpyi(dtype=np.complex64) + + def test_Zaxpyi(self): + self.generic_test_axpyi(dtype=np.complex128) + + def generic_test_doti(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + result = self.cus.doti(xval, xind, y) + self.assertTrue(result) + + def test_Sdoti(self): + self.generic_test_doti(dtype=np.float32) + + def test_Zdoti(self): + self.generic_test_doti(dtype=np.complex128) + + def generic_test_dotci(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + result = self.cus.dotci(xval, xind, y) + self.assertTrue(result) + + def test_Zdotci(self): + self.generic_test_dotci(dtype=np.complex128) + + def generic_test_gthr(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + self.cus.gthr(y, xval, xind) + self.assertTrue(np.all(xval == 1)) + + def test_Sgthr(self): + 
self.generic_test_gthr(dtype=np.float32) + + def test_Cgthr(self): + self.generic_test_gthr(dtype=np.complex64) + + def generic_test_gthrz(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + self.cus.gthrz(y, xval, xind) + self.assertTrue(np.all(xval == 1)) + self.assertTrue(np.all(y[xind] == 0)) + + def test_Dgthr(self): + self.generic_test_gthrz(dtype=np.float64) + + def test_Zgthr(self): + self.generic_test_gthrz(dtype=np.complex128) + + def generic_test_roti(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + c = .2 + s = .3 + oldxval = xval.copy() + oldy = y.copy() + self.cus.roti(xval, xind, y, c, s) + self.assertFalse(np.all(oldxval == xval)) + self.assertFalse(np.all(oldy == y)) + + def test_Sroti(self): + self.generic_test_roti(dtype=np.float32) + + def test_Droti(self): + self.generic_test_roti(dtype=np.float64) + + def generic_test_sctr(self, dtype): + xval = np.arange(5, dtype=dtype) + 1 + xind = np.arange(xval.size, dtype='int32') * 2 + y = np.ones(shape=xval.size * 2, dtype=xval.dtype) + oldy = y.copy() + self.cus.sctr(xval, xind, y) + self.assertFalse(np.all(oldy == y)) + + def test_Ssctr(self): + self.generic_test_sctr(dtype=np.float32) + + def test_Csctr(self): + self.generic_test_sctr(dtype=np.complex64) + + +class TestCuSparseMatrixOp(CUDATestCase): + def test_bsr_matrix(self): + from pyculib.sparse import bsr_matrix + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6]) + expect = scipy.sparse.bsr_matrix((data, (row, col)), shape=(3, 3)) + mat = bsr_matrix((data, (row, col)), shape=(3, 3)) + host = mat.copy_to_host() + self.assertTrue(np.all(host.indices == expect.indices)) + self.assertTrue(np.all(host.indptr == expect.indptr)) + self.assertTrue(np.all(host.data == expect.data)) + + def test_matdescr(self): + from pyculib.sparse import Sparse + sparse = Sparse() + md = sparse.matdescr() + md.diagtype = 'N' + md.fillmode = 'L' + md.indexbase = 0 + md.matrixtype = 'G' + + self.assertEqual('N', md.diagtype) + self.assertEqual('L', md.fillmode) + self.assertEqual(0, md.indexbase) + self.assertEqual('G', md.matrixtype) + del md + + +class TestCuSparseLevel2(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def generic_test_bsrmv(self, dtype): + from pyculib.sparse import bsr_matrix + + row = np.array([0, 0, 1, 2, 2, 2]) + col = np.array([0, 2, 2, 0, 1, 2]) + data = np.array([1, 2, 3, 4, 5, 6], dtype=dtype) + + bsrmat = bsr_matrix((data, (row, col)), shape=(3, 3)) + x = np.ones(3, dtype=dtype) + y = np.ones(3, dtype=dtype) + oldy = y.copy() + + alpha = 1 + beta = 1 + descr = self.cus.matdescr() + self.cus.bsrmv_matrix('C', 'N', alpha, descr, bsrmat, x, beta, y) + + self.assertFalse(np.all(y == oldy)) + + def test_Sbsrmv(self): + dtype = np.float32 + self.generic_test_bsrmv(dtype=dtype) + + def test_Cbsrmv(self): + dtype = np.complex64 + self.generic_test_bsrmv(dtype=dtype) + + def test_Sbsrxmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + alpha = 0 + beta = 0 + descr = self.cus.matdescr() + bsrVal = np.zeros(10, dtype=dtype) + bsrMaskPtr = np.zeros(10, dtype=np.int32) + bsrRowPtr = np.zeros(10, dtype=np.int32) + bsrEndPtr = np.zeros(10, dtype=np.int32) + bsrColInd = np.zeros(10, dtype=np.int32) + blockDim = 1 + x = np.zeros(10, dtype=dtype) + 
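+        # alpha = beta = 0 with all-zero inputs: this only checks that the bsrxmv call completes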
y = np.zeros(10, dtype=dtype) + self.cus.bsrxmv('C', 'N', 1, 1, 1, 1, alpha, descr, bsrVal, + bsrMaskPtr, bsrRowPtr, bsrEndPtr, bsrColInd, + blockDim, x, beta, y) + + def test_Scsrmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + alpha = 0 + beta = 0 + descr = self.cus.matdescr() + csrVal = np.zeros(10, dtype=dtype) + csrColInd = np.zeros(10, dtype=np.int32) + csrRowPtr = np.zeros(10, dtype=np.int32) + x = np.zeros(10, dtype=dtype) + y = np.zeros(10, dtype=dtype) + trans = 'N' + m = 1 + n = 1 + nnz = 1 + self.cus.csrmv(trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, + csrColInd, x, beta, y) + + def test_Scsrmv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descr = self.cus.matdescr() + csrVal = np.zeros(10, dtype=dtype) + csrColInd = np.zeros(10, dtype=np.int32) + csrRowPtr = np.zeros(10, dtype=np.int32) + trans = 'N' + m = 1 + nnz = 1 + info = self.cus.csrsv_analysis(trans, m, nnz, descr, csrVal, + csrRowPtr, csrColInd) + + alpha = 1.0 + x = np.zeros(10, dtype=dtype) + y = np.zeros(10, dtype=dtype) + self.cus.csrsv_solve(trans, m, alpha, descr, csrVal, csrRowPtr, + csrColInd, info, x, y) + + +class TestCuSparseLevel3(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_Scsrmm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descrA = self.cus.matdescr() + B = C = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + ldb = 1 + ldc = 1 + m = 1 + n = 1 + k = 1 + nnz = 1 + alpha = 1 + beta = 1 + transA = 'N' + self.cus.csrmm(transA, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + + def test_Ccsrmm(self): + """ + Just exercise the codepath + """ + dtype = np.complex64 + + descrA = self.cus.matdescr() + B = C = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + ldb = 1 + ldc = 1 + m = 1 + n = 1 + k = 1 + nnz = 1 + alpha = 1 + beta = 1 + transA = transB = 'N' + self.cus.csrmm2(transA, transB, m, n, k, nnz, alpha, descrA, + csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + + def test_Scsrsm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + descrA = self.cus.matdescr() + X = Y = csrValA = np.zeros(10, dtype=dtype) + csrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrA = np.zeros(10, dtype=np.int32) + m = 1 + n = 1 + nnz = 1 + transA = 'N' + info = self.cus.csrsm_analysis(transA, m, nnz, descrA, csrValA, + csrRowPtrA, csrColIndA) + alpha = 1 + ldx = 1 + ldy = 1 + self.cus.csrsm_solve(transA, m, n, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, info, X, ldx, Y, ldy) + + +class TestCuSparseExtra(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_XcsrgeamNnz(self): + """ + Just exercise the codepath + """ + m = n = 1 + nnzA = 1 + nnzB = 1 + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + nnzC = self.cus.XcsrgeamNnz(m, n, descrA, nnzA, csrRowPtrA, csrColIndA, + descrB, nnzB, csrRowPtrB, csrColIndB, + descrC, + csrRowPtrC) + self.assertTrue(isinstance(nnzC, int)) + + def test_Scsrgeam(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + m = n = 1 + nnzA = 1 + nnzB = 1 + alpha = beta = 1 + csrValA = csrValB = csrValC = np.zeros(10, dtype=dtype) + descrA = descrB 
= descrC = self.cus.matdescr() + csrColIndA = csrColIndB = csrColIndC = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + self.cus.csrgeam(m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, beta, descrB, nnzB, csrValB, + csrRowPtrB, csrColIndB, descrC, csrValC, + csrRowPtrC, csrColIndC) + + + def test_XcsrgemmNnz(self): + """ + Just exercise the codepath + """ + m = n = k = 1 + nnzA = 1 + nnzB = 1 + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + transA = transB = 'N' + nnzC = self.cus.XcsrgemmNnz(transA, transB, m, n, k, descrA, nnzA, + csrRowPtrA, + csrColIndA, descrB, nnzB, csrRowPtrB, + csrColIndB, descrC, + csrRowPtrC) + self.assertTrue(isinstance(nnzC, int)) + + def test_Scsrgemm(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + m = n = k = 0 + transA = transB = 'N' + nnzA = 0 + nnzB = 0 + csrValA = csrValB = csrValC = np.zeros(10, dtype=dtype) + descrA = descrB = descrC = self.cus.matdescr() + csrColIndA = csrColIndB = csrColIndC = np.zeros(10, dtype=np.int32) + csrRowPtrA = csrRowPtrB = csrRowPtrC = np.zeros(10, dtype=np.int32) + self.cus.csrgemm(transA, transB, m, n, k, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, + csrRowPtrB, + csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC) + + def test_csrgemm_ez(self): + from pyculib.sparse import CudaSparseMatrix + from scipy.sparse import csr_matrix + + def random_square_csr_matrix(N): + X = np.random.random((N, N)) + X[X > 0.7] = 0 + return csr_matrix(X) + + a = random_square_csr_matrix(10) + b = random_square_csr_matrix(10) + a_cuda = CudaSparseMatrix().from_host_matrix(a) + b_cuda = CudaSparseMatrix().from_host_matrix(b) + c_cuda = self.cus.csrgemm_ez(a, b) + c_host = c_cuda.copy_to_host() + c_gold = a * b + # Conversion to dense necessary because Scipy and cuSPARSE order their + # indices differently + np.testing.assert_allclose(c_gold.toarray(), c_host.toarray()) + + +class TestCuSparsePreconditioners(CUDATestCase): + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def test_Scsric0(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = n = 3 + trans = 'N' + + sary = scipy.sparse.rand(m, n, 0.75, format='csr', dtype=dtype) + nnz = sary.nnz + csrValM = sary.data + csrRowPtrA = sary.indptr + csrColIndA = sary.indices + + descr = self.cus.matdescr(matrixtype='S') + info = self.cus.csrsv_analysis(trans, m, nnz, descr, csrValM, + csrRowPtrA, csrColIndA) + self.cus.csric0(trans, m, descr, csrValM, csrRowPtrA, + csrColIndA, info) + + def test_Scsrilu0(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = n = 3 + trans = 'N' + + sary = scipy.sparse.rand(m, n, 0.75, format='csr', dtype=dtype) + csrValM = sary.data + csrRowPtrA = sary.indptr + csrColIndA = sary.indices + + descr = self.cus.matdescr() + info = self.cus.api.solve_analysis_info() + self.cus.csrilu0(trans, m, descr, csrValM, csrRowPtrA, + csrColIndA, info) + + def test_Sgtsv(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + m = 4 + n = 3 + ldb = m + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + B = np.ones((m, n), dtype=dtype, order='F') + Bcopy = B.copy() + self.cus.gtsv(m, n, dl, d, du, B, ldb) + self.assertTrue(not np.all(B == Bcopy)) + + def test_Sgtsv_nopivot(self): + """ 
+ Just exercise the codepath + """ + dtype = np.float32 + + m = 4 + n = 3 + ldb = m + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + B = np.ones((m, n), dtype=dtype, order='F') + Bcopy = B.copy() + self.cus.gtsv_nopivot(m, n, dl, d, du, B, ldb) + self.assertTrue(not np.all(B == Bcopy)) + + def test_SgtsvStridedBatch(self): + """ + Just exercise the codepath + """ + dtype = np.float32 + + batchCount = 1 + batchStride = 4 + + m = 4 + n = 3 + dl = np.asarray([3] * 8, dtype=dtype) + d = np.asarray([1] * 9, dtype=dtype) + du = np.asarray([4] * 8, dtype=dtype) + x = np.ones((m, n), dtype=dtype, order='F') + xcopy = x.copy() + self.cus.gtsvStridedBatch(m, dl, d, du, x, batchCount, batchStride) + self.assertTrue(not np.all(x == xcopy)) + + +class TestCuSparseFormatConversion(CUDATestCase): + """ + These test can corrupt the CUDA context making the remaining test fails + """ + + def setUp(self): + from pyculib.sparse import Sparse + self.cus = Sparse() + + def tearDown(self): + del self.cus + + def test_Sbsr2csr(self): + dtype = np.dtype('float32') + + dirA = 'C' + mb = nb = 0 + descrA = self.cus.matdescr() + descrC = self.cus.matdescr() + bsrValA = csrValC = np.zeros(10, dtype=dtype) + bsrRowPtrA = bsrColIndA = np.zeros(10, dtype=np.int32) + csrRowPtrC = csrColIndC = np.zeros(10, dtype=np.int32) + blockDim = 1 + self.cus.bsr2csr(dirA, mb, nb, descrA, bsrValA, bsrRowPtrA, + bsrColIndA, blockDim, descrC, csrValC, + csrRowPtrC, csrColIndC) + + def test_Xcoo2csr(self): + nnz = 1 + m = 1 + csrRowPtr = np.zeros(20, dtype=np.int32) + cooRowInd = np.zeros(20, dtype=np.int32) + self.cus.Xcoo2csr(cooRowInd, nnz, m, csrRowPtr) + + def test_Scsc2dense(self): + m = n = 1 + lda = 1 + descrA = self.cus.matdescr() + cscValA = np.zeros(10, dtype=np.float32) + cscRowIndA = cscColPtrA = A = np.zeros(10, dtype=np.int32) + self.cus.csc2dense(m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, + lda) + + def test_Xcsr2bsrNnz(self): + dirA = 'C' + m = n = 1 + blockDim = 1 + descrC = descrA = self.cus.matdescr() + bsrRowPtrC = csrRowPtrA = csrColIndA = np.zeros(10, dtype=np.int32) + self.cus.Xcsr2bsrNnz(dirA, m, n, descrA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrRowPtrC) + + def test_Scsr2bsr(self): + dtype = np.float32 + + dirA = 'C' + m = n = 1 + blockDim = 1 + descrC = descrA = self.cus.matdescr() + csrValA = bsrColIndC = bsrValC = np.zeros(10, dtype=dtype) + bsrRowPtrC = csrRowPtrA = csrColIndA = np.zeros(10, dtype=np.int32) + self.cus.csr2bsr(dirA, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, + blockDim, descrC, bsrValC, bsrRowPtrC, bsrColIndC) + + def test_Xcsr2coo(self): + nnz = m = 1 + csrRowPtr = cooRowInd = np.zeros(10, dtype=np.int32) + self.cus.Xcsr2coo(csrRowPtr, nnz, m, cooRowInd) + + def test_Scsr2csc(self): + dtype = np.float32 + m = n = nnz = 1 + csrVal = cscVal = np.zeros(10, dtype=dtype) + csrRowPtr = csrColInd = np.zeros(10, dtype=np.int32) + cscRowInd = cscColPtr = np.zeros(10, dtype=np.int32) + copyValues = 'N' + self.cus.csr2csc(m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscRowInd, cscColPtr, copyValues) + + def test_Scsr2dense(self): + dtype = np.float32 + + m = n = 1 + lda = 1 + descrA = self.cus.matdescr() + A = csrValA = np.zeros(10, dtype=dtype) + csrRowPtrA = csrColIndA = np.zeros(10, np.int32) + self.cus.csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, + lda) + + def test_Sdense2csc(self): + dtype = np.float32 + + m = n = 2 + lda = m + nnzPerCol = np.ones(n, np.int32) * 2 + descrA = 
self.cus.matdescr() + A = np.ones(n, dtype=dtype) + cscValA = np.zeros(10, dtype=dtype) + cscRowIndA = np.zeros(10, np.int32) + cscColPtrA = np.zeros(10, np.int32) + + self.cus.dense2csc(m, n, descrA, A, lda, nnzPerCol, cscValA, + cscRowIndA, cscColPtrA) + + def test_Sdense2csr(self): + dtype = np.float32 + + m = n = 2 + lda = m + nnzPerRow = np.ones(n, np.int32) * 2 + descrA = self.cus.matdescr() + A = np.ones(10, dtype=dtype) + csrValA = np.zeros(10, dtype=dtype) + csrRowPtrA = csrColIndA = np.zeros(10, np.int32) + self.cus.dense2csr(m, n, descrA, A, lda, nnzPerRow, csrValA, + csrRowPtrA, csrColIndA) + + def test_Snnz(self): + dtype = np.float32 + dirA = 'C' + m = n = 2 + lda = m + descrA = self.cus.matdescr() + A = np.ones(10, dtype=dtype) + nnzPerRowColumn = np.zeros(10, np.int32) + self.cus.nnz(dirA, m, n, descrA, A, lda, nnzPerRowColumn) + + +def test(): + import sys + loader = unittest.TestLoader() + suite = unittest.TestSuite() + mod = sys.modules[__name__] + for name in dir(mod): + if name.startswith('Test'): + test_class = getattr(mod, name) + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + runner = unittest.runner.TextTestRunner() + return runner.run(suite) + +test_cases = (TestCuSparseLevel1, TestCuSparseMatrixOp, TestCuSparseLevel2, + TestCuSparseLevel3, TestCuSparseExtra, + TestCuSparsePreconditioners, TestCuSparseFormatConversion) + + +if __name__ == '__main__': + unittest.main() diff --git a/pyculib/utils/__init__.py b/pyculib/utils/__init__.py new file mode 100644 index 0000000..4b74290 --- /dev/null +++ b/pyculib/utils/__init__.py @@ -0,0 +1,4 @@ +from __future__ import absolute_import +from .libutils import ctype_function, Lib +from .cctypes import c_complex, c_double_complex, memalign +from .finalizer import OwnerMixin diff --git a/pyculib/utils/cctypes.py b/pyculib/utils/cctypes.py new file mode 100644 index 0000000..99f6a58 --- /dev/null +++ b/pyculib/utils/cctypes.py @@ -0,0 +1,47 @@ +from __future__ import absolute_import, print_function, division +from ctypes import c_float, c_double, Structure, c_uint8, sizeof, addressof +import numpy as np + + +class c_complex(Structure): + _fields_ = [('real', c_float), ('imag', c_float)] + + def __init__(self, real=0, imag=0): + if isinstance(real, (complex, np.complex64, np.complex128)): + real, imag = real.real, real.imag + super(c_complex, self).__init__(real, imag) + + @property + def value(self): + return complex(self.real, self.imag) + + +class c_double_complex(Structure): + _fields_ = [('real', c_double), ('imag', c_double)] + + def __init__(self, real=0, imag=0): + if isinstance(real, (complex, np.complex64, np.complex128)): + real, imag = real.real, real.imag + super(c_double_complex, self).__init__(real, imag) + + @property + def value(self): + return complex(self.real, self.imag) + + +def memalign(cty, align): + """Allocate a ctype object on the specific byte alignment + """ + # Allocate bytes with offset + mem = (c_uint8 * (sizeof(cty) + align))() + addr = addressof(mem) + + # Move to alignment + offset = addr % align + if offset: + offset = align - offset + + buf = cty.from_address(offset + addr) + assert 0 == addressof(buf) % align + + return buf, mem diff --git a/pyculib/utils/finalizer.py b/pyculib/utils/finalizer.py new file mode 100644 index 0000000..17ca300 --- /dev/null +++ b/pyculib/utils/finalizer.py @@ -0,0 +1,43 @@ +''' +Modified C-level finalizer by Benjamin Peterson +Available at http://code.activestate.com/recipes/577242-calling-c-level-finalizers-without-__del__/ +''' 
+import weakref + +class OwnerRef(weakref.ref): + """A simple weakref.ref subclass, so attributes can be added.""" + pass + +def _run_finalizer(ref): + """Internal weakref callback to run finalizers""" + del _finalize_refs[ref.owner] + for item, finalizer in ref.items: + try: + finalizer(item) + except Exception: + pass + +_finalize_refs = {} + +def track(owner, item, finalizer): + """Register an object for finalization. + + ``owner`` is the the object which is responsible for ``item``. + ``finalizer`` will be called with ``item`` as its only argument when + ``owner`` is destroyed by the garbage collector. + """ + if id(owner) in _finalize_refs: + ref = _finalize_refs[id(owner)] + else: + ref = OwnerRef(owner, _run_finalizer) + ref.owner = id(owner) + ref.items = [] + ref.items.append((item, finalizer)) + _finalize_refs[id(owner)] = ref + +class OwnerMixin(object): + def _finalizer_track(self, item): + if not hasattr(self, '_finalize'): + raise AttributeError("%s must define a _finalize method" % self) + track(self, item, type(self)._finalize) + diff --git a/pyculib/utils/libutils.py b/pyculib/utils/libutils.py new file mode 100644 index 0000000..92434ac --- /dev/null +++ b/pyculib/utils/libutils.py @@ -0,0 +1,55 @@ +from __future__ import absolute_import +from numba.cuda.cudadrv.libs import open_cudalib + +class ctype_function(object): + def __init__(self, restype=None, *argtypes): + self.restype = restype + self.argtypes = argtypes + +class Lib(object): + __singleton = None + lib = None + + def __new__(cls): + # Check if we already have opened the dll + if cls.__singleton is None: + try: + dll = open_cudalib(cls.lib) + except OSError as e: + raise Exception("Cannot open library for %s:\n%s" % (cls.lib, + e)) + # Create new instance + inst = object.__new__(cls) + cls.__singleton = inst + inst.dll = dll + inst._initialize() + else: + inst = cls.__singleton + return inst + + def _initialize(self): + # Populate the instance with the functions + for name, obj in vars(type(self)).items(): + if isinstance(obj, ctype_function): + fn = getattr(self.dll, name) + fn.restype = obj.restype + fn.argtypes = obj.argtypes + setattr(self, name, self._auto_checking_wrapper(fn, name=name)) + + def _auto_checking_wrapper(self, fn, name): + def wrapped(*args, **kws): + nargs = len(args) + len(kws) + expected = len(fn.argtypes) + if nargs != expected: + msg = "expecting {expected} arguments but got {nargs}: {fname}" + raise TypeError(msg.format(expected=expected, nargs=nargs, + fname=name)) + status = fn(*args, **kws) + self.check_error(status) + return status + return wrapped + + def check_error(self, status): + if status != 0: + raise self.ErrorType(status) + diff --git a/pyculib/warnings.py b/pyculib/warnings.py new file mode 100644 index 0000000..88037db --- /dev/null +++ b/pyculib/warnings.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import +from pyculib import config +import warnings # so we can use warnings.warn +from warnings import * + +class PerformanceWarning(Warning): + """ + Warning category for when an operation might not be + as fast as expected. + """ + + +# Define a simple no-op for the (default) case +# where performance warnings are disabled. 
+ +def no_warn(*args, **kwds): pass + +if config.WARNINGS: + warn = warnings.warn + +else: + warn = no_warn diff --git a/runtests.py b/runtests.py new file mode 100755 index 0000000..1125c10 --- /dev/null +++ b/runtests.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +import runpy + +runpy.run_module('pyculib.runtests', run_name='__main__') diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3f5a53c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = pyculib/_version.py +versionfile_build = pyculib/_version.py +tag_prefix = +parentdir_prefix = pyculib diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..862ab39 --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + +from distutils.spawn import spawn +from distutils.command import build + +import versioneer + +class build_doc(build.build): + description = "build documentation" + + def run(self): + spawn(['make', '-C', 'docs', 'html']) + +packages = [ + 'pyculib', + 'pyculib.blas', + 'pyculib.fft', + 'pyculib.rand', + 'pyculib.sparse', + 'pyculib.sorting', + 'pyculib.utils', + 'pyculib.tests', +] + +cmdclass = versioneer.get_cmdclass() +cmdclass['build_doc'] = build_doc + +if __name__ == '__main__': + setup( + name='pyculib', + description='Pyculib - python bindings for NVIDIA CUDA libraries', + author='Continuum Analytics, Inc.', + author_email='support@continuum.io', + url='http://continuum.io', + packages=packages, + license='BSD', + version=versioneer.get_version(), + cmdclass=cmdclass, + ) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..64fea1c --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. 
"git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. 
Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other langauges) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. 
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . 
In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += plus_or_dot(pieces)
+            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
+                                          pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def render_pep440_pre(pieces):
+    """TAG[.post.devDISTANCE] -- No -dirty.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += ".post.dev%d" % pieces["distance"]
+    else:
+        # exception #1
+        rendered = "0.post.dev%d" % pieces["distance"]
+    return rendered
+
+
+def render_pep440_post(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    The ".dev0" means dirty. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyways.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+    return rendered
+
+
+def render_pep440_old(pieces):
+    """TAG[.postDISTANCE[.dev0]] .
+
+    The ".dev0" means dirty.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty] (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces):
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
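+    # Whichever method succeeds, the result is a dict of the same shape,
+    # e.g. (illustrative values only):
+    #   {"version": "1.0+2.g1234567", "full-revisionid": "1234567...",
+    #    "dirty": False, "error": None, "date": "2017-01-01T00:00:00+0000"}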
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? + + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)
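+
+
+# Typical consumption from a project's setup.py (illustrative sketch; it
+# simply mirrors the wiring shown in CONFIG_ERROR above):
+#
+#     import versioneer
+#
+#     setup(version=versioneer.get_version(),
+#           cmdclass=versioneer.get_cmdclass(),
+#           ...)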