diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt new file mode 100644 index 00000000000..d552bb4ebe9 --- /dev/null +++ b/modules/cudaarithm/CMakeLists.txt @@ -0,0 +1,27 @@ +if(IOS OR WINRT OR (NOT HAVE_CUDA AND NOT BUILD_CUDA_STUBS)) + ocv_module_disable(cudaarithm) +endif() + +set(the_description "CUDA-accelerated Operations on Matrices") + +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) + +ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev WRAP python) + +ocv_module_include_directories() +ocv_glob_module_sources() + +set(extra_libs "") + +if(HAVE_CUBLAS) + list(APPEND extra_libs ${CUDA_cublas_LIBRARY}) +endif() + +if(HAVE_CUFFT) + list(APPEND extra_libs ${CUDA_cufft_LIBRARY}) +endif() + +ocv_create_module(${extra_libs}) + +ocv_add_accuracy_tests(DEPENDS_ON opencv_imgproc) +ocv_add_perf_tests(DEPENDS_ON opencv_imgproc) diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp new file mode 100644 index 00000000000..c357f77b4f1 --- /dev/null +++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp @@ -0,0 +1,878 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef OPENCV_CUDAARITHM_HPP +#define OPENCV_CUDAARITHM_HPP + +#ifndef __cplusplus +# error cudaarithm.hpp header must be compiled as C++ +#endif + +#include "opencv2/core/cuda.hpp" + +/** + @addtogroup cuda + @{ + @defgroup cudaarithm Operations on Matrices + @{ + @defgroup cudaarithm_core Core Operations on Matrices + @defgroup cudaarithm_elem Per-element Operations + @defgroup cudaarithm_reduce Matrix Reductions + @defgroup cudaarithm_arithm Arithm Operations on Matrices + @} + @} + */ + +namespace cv { namespace cuda { + +//! @addtogroup cudaarithm +//! @{ + +//! @addtogroup cudaarithm_elem +//! @{ + +/** @brief Computes a matrix-matrix or matrix-scalar sum. + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . +@param dst Destination matrix that has the same size and number of channels as the input array(s). +The depth is defined by dtype or src1 depth. +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param dtype Optional depth of the output array. +@param stream Stream for the asynchronous version. + +@sa add + */ +CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar difference. + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . +@param dst Destination matrix that has the same size and number of channels as the input array(s). +The depth is defined by dtype or src1 depth. +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param dtype Optional depth of the output array. +@param stream Stream for the asynchronous version. + +@sa subtract + */ +CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar per-element product. + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and number of channels as the input array(s). +The depth is defined by dtype or src1 depth. +@param scale Optional scale factor. +@param dtype Optional depth of the output array. +@param stream Stream for the asynchronous version. + +@sa multiply + */ +CV_EXPORTS_W void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar division. + +@param src1 First source matrix or a scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and number of channels as the input array(s). +The depth is defined by dtype or src1 depth. +@param scale Optional scale factor. +@param dtype Optional depth of the output array. +@param stream Stream for the asynchronous version. + +This function, in contrast to divide, uses a round-down rounding mode. 
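As a quick orientation for the add/subtract/multiply/divide family documented here, the following host-side sketch (illustrative only; the helper name and the same-size CV_8UC1 inputs are assumptions, not part of this patch) shows the typical upload, per-element operation, and download flow:

#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: masked matrix-matrix add and matrix-scalar divide on the GPU.
void elementwiseSketch(const cv::Mat& a, const cv::Mat& b, const cv::Mat& mask) // same-size CV_8UC1 inputs assumed
{
    cv::cuda::GpuMat d_a(a), d_b(b), d_mask(mask), d_sum, d_ratio;

    // Only pixels where the mask is non-zero are written to the destination.
    cv::cuda::add(d_a, d_b, d_sum, d_mask);

    // Matrix-scalar division with the default scale and output depth.
    cv::cuda::divide(d_a, cv::Scalar(2), d_ratio);

    cv::Mat sum, ratio;
    d_sum.download(sum);
    d_ratio.download(ratio);
}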
+ +@sa divide + */ +CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); + +/** @brief Computes per-element absolute difference of two matrices (or of a matrix and scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param stream Stream for the asynchronous version. + +@sa absdiff + */ +CV_EXPORTS_W void absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes an absolute value of each matrix element. + +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + +@sa abs + */ +CV_EXPORTS_W void abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes a square value of each matrix element. + +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes a square root of each matrix element. + +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + +@sa sqrt + */ +CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes an exponent of each matrix element. + +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + +@sa exp + */ +CV_EXPORTS_W void exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes a natural logarithm of absolute value of each matrix element. + +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + +@sa log + */ +CV_EXPORTS_W void log(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Raises every matrix element to a power. + +@param src Source matrix. +@param power Exponent of power. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + +The function pow raises every element of the input matrix to power : + +\f[\texttt{dst} (I) = \fork{\texttt{src}(I)^power}{if \texttt{power} is integer}{|\texttt{src}(I)|^power}{otherwise}\f] + +@sa pow + */ +CV_EXPORTS_W void pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Compares elements of two matrices (or of a matrix and scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param cmpop Flag specifying the relation between the elements to be checked: +- **CMP_EQ:** a(.) == b(.) +- **CMP_GT:** a(.) \> b(.) +- **CMP_GE:** a(.) \>= b(.) +- **CMP_LT:** a(.) \< b(.) +- **CMP_LE:** a(.) \<= b(.) +- **CMP_NE:** a(.) != b(.) +@param stream Stream for the asynchronous version. + +@sa compare + */ +CV_EXPORTS_W void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null()); + +/** @brief Performs a per-element bitwise inversion. 
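A short sketch of how the per-element math and comparison functions above compose (the helper name and the CV_32FC1 inputs are assumptions for illustration):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: square each element, then flag elements whose square exceeds a reference.
void powCompareSketch(const cv::cuda::GpuMat& src,   // assumed CV_32FC1
                      const cv::cuda::GpuMat& ref,   // same size/type as src
                      cv::cuda::GpuMat& flags)       // CV_8UC1: 255 where src(I)^2 > ref(I)
{
    cv::cuda::GpuMat squared;
    cv::cuda::pow(src, 2.0, squared);
    cv::cuda::compare(squared, ref, flags, cv::CMP_GT);
}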
+ +@param src Source matrix. +@param dst Destination matrix with the same size and type as src . +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the +destination array to be changed. The mask can be used only with single channel images. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Performs pixel-by-pixel right shift of an image by a constant value. + +@param src Source matrix. Supports 1, 3 and 4 channel images with integer elements. +@param val Constant values, one per channel. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Performs pixel-by-pixel left shift of an image by a constant value. + +@param src Source matrix. Supports 1, 3 and 4 channel images with CV_8U , CV_16U or CV_32S +depth. +@param val Constant values, one per channel. +@param dst Destination matrix with the same size and type as src . +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes the per-element minimum of two matrices (or a matrix and a scalar). + +@param src1 First source matrix or scalar.
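For the bitwise and shift operations above, a minimal sketch (8-bit single-channel inputs assumed; the helper name is hypothetical):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: keep only masked pixels, then halve intensities with a per-pixel right shift.
void maskAndShiftSketch(const cv::cuda::GpuMat& image,  // CV_8UC1
                        const cv::cuda::GpuMat& mask,   // CV_8UC1, non-zero = keep
                        cv::cuda::GpuMat& out)
{
    cv::cuda::GpuMat masked;
    cv::cuda::bitwise_and(image, image, masked, mask);   // copy-through where the mask is set
    cv::cuda::rshift(masked, cv::Scalar_<int>(1), out);  // divide by 2 for integer images
}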
+@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param stream Stream for the asynchronous version. + +@sa min + */ +CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes the per-element maximum of two matrices (or a matrix and a scalar). + +@param src1 First source matrix or scalar. +@param src2 Second source matrix or scalar. +@param dst Destination matrix that has the same size and type as the input array(s). +@param stream Stream for the asynchronous version. + +@sa max + */ +CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes the weighted sum of two arrays. + +@param src1 First source array. +@param alpha Weight for the first array elements. +@param src2 Second source array of the same size and channel number as src1 . +@param beta Weight for the second array elements. +@param dst Destination array that has the same size and number of channels as the input arrays. +@param gamma Scalar added to each sum. +@param dtype Optional depth of the destination array. When both input arrays have the same depth, +dtype can be set to -1, which will be equivalent to src1.depth(). +@param stream Stream for the asynchronous version. + +The function addWeighted calculates the weighted sum of two arrays as follows: + +\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} + \texttt{src2} (I)* \texttt{beta} + \texttt{gamma} )\f] + +where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each +channel is processed independently. + +@sa addWeighted + */ +CV_EXPORTS_W void addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, + int dtype = -1, Stream& stream = Stream::Null()); + +//! adds scaled array to another one (dst = alpha*src1 + src2) +static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst, Stream& stream = Stream::Null()) +{ + addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream); +} + +/** @brief Applies a fixed-level threshold to each array element. + +@param src Source array (single-channel). +@param dst Destination array with the same size and type as src . +@param thresh Threshold value. +@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. +@param type Threshold type. For details, see threshold . The THRESH_OTSU and THRESH_TRIANGLE +threshold types are not supported. +@param stream Stream for the asynchronous version. + +@sa threshold + */ +CV_EXPORTS_W double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null()); + +/** @brief Computes magnitudes of complex matrix elements. + +@param xy Source complex matrix in the interleaved format ( CV_32FC2 ). +@param magnitude Destination matrix of float magnitudes ( CV_32FC1 ). +@param stream Stream for the asynchronous version. + +@sa magnitude + */ +CV_EXPORTS_W void magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null()); + +/** @brief Computes squared magnitudes of complex matrix elements. + +@param xy Source complex matrix in the interleaved format ( CV_32FC2 ). +@param magnitude Destination matrix of float magnitude squares ( CV_32FC1 ). +@param stream Stream for the asynchronous version. 
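The following sketch chains addWeighted and threshold on one stream, which is the intended use of the Stream parameter documented above (the helper name and CV_8UC1 inputs are assumptions):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: 50/50 blend followed by a fixed-level binary threshold,
// both enqueued on the same stream so no host synchronization happens in between.
void blendThresholdSketch(const cv::cuda::GpuMat& a, const cv::cuda::GpuMat& b, // CV_8UC1
                          cv::cuda::GpuMat& binary, cv::cuda::Stream& stream)
{
    cv::cuda::GpuMat blended;
    cv::cuda::addWeighted(a, 0.5, b, 0.5, 0.0, blended, -1, stream);
    cv::cuda::threshold(blended, binary, 128.0, 255.0, cv::THRESH_BINARY, stream);
}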
+ */ +CV_EXPORTS_W void magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null()); + +/** @overload + computes magnitude of each (x(i), y(i)) vector + supports only floating-point source +@param x Source matrix containing real components ( CV_32FC1 ). +@param y Source matrix containing imaginary components ( CV_32FC1 ). +@param magnitude Destination matrix of float magnitudes ( CV_32FC1 ). +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null()); + +/** @overload + computes squared magnitude of each (x(i), y(i)) vector + supports only floating-point source +@param x Source matrix containing real components ( CV_32FC1 ). +@param y Source matrix containing imaginary components ( CV_32FC1 ). +@param magnitude Destination matrix of float magnitude squares ( CV_32FC1 ). +@param stream Stream for the asynchronous version. +*/ +CV_EXPORTS_W void magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null()); + +/** @brief Computes polar angles of complex matrix elements. + +@param x Source matrix containing real components ( CV_32FC1 ). +@param y Source matrix containing imaginary components ( CV_32FC1 ). +@param angle Destination matrix of angles ( CV_32FC1 ). +@param angleInDegrees Flag for angles that must be evaluated in degrees. +@param stream Stream for the asynchronous version. + +@sa phase + */ +CV_EXPORTS_W void phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); + +/** @brief Converts Cartesian coordinates into polar. + +@param x Source matrix containing real components ( CV_32FC1 ). +@param y Source matrix containing imaginary components ( CV_32FC1 ). +@param magnitude Destination matrix of float magnitudes ( CV_32FC1 ). +@param angle Destination matrix of angles ( CV_32FC1 ). +@param angleInDegrees Flag for angles that must be evaluated in degrees. +@param stream Stream for the asynchronous version. + +@sa cartToPolar + */ +CV_EXPORTS_W void cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); + +/** @brief Converts polar coordinates into Cartesian. + +@param magnitude Source matrix containing magnitudes ( CV_32FC1 ). +@param angle Source matrix containing angles ( CV_32FC1 ). +@param x Destination matrix of real components ( CV_32FC1 ). +@param y Destination matrix of imaginary components ( CV_32FC1 ). +@param angleInDegrees Flag that indicates angles in degrees. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null()); + +//! @} cudaarithm_elem + +//! @addtogroup cudaarithm_core +//! @{ + +/** @brief Makes a multi-channel matrix out of several single-channel matrices. + +@param src Array/vector of source matrices. +@param n Number of source matrices. +@param dst Destination matrix. +@param stream Stream for the asynchronous version. + +@sa merge + */ +CV_EXPORTS_W void merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const std::vector& src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Copies each plane of a multi-channel matrix into an array. + +@param src Source matrix. 
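A small sketch of the coordinate-conversion and channel-composition functions above (gradient planes assumed to be CV_32FC1; the helper is illustrative):

#include <vector>
#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: magnitude/angle from x/y gradient planes, plus an interleaved CV_32FC2 copy.
void gradientPolarSketch(const cv::cuda::GpuMat& gx, const cv::cuda::GpuMat& gy,
                         cv::cuda::GpuMat& mag, cv::cuda::GpuMat& angle)
{
    cv::cuda::cartToPolar(gx, gy, mag, angle, true /* angleInDegrees */);

    // Interleave the planes, e.g. to feed the xy overloads of magnitude/magnitudeSqr.
    std::vector<cv::cuda::GpuMat> planes = {gx, gy};
    cv::cuda::GpuMat xy;
    cv::cuda::merge(planes, xy);
}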
+@param dst Destination array/vector of single-channel matrices. +@param stream Stream for the asynchronous version. + +@sa split + */ +CV_EXPORTS_W void split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null()); +/** @overload */ +CV_EXPORTS_W void split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null()); + +/** @brief Transposes a matrix. + +@param src1 Source matrix. 1-, 4-, 8-byte element sizes are supported for now. +@param dst Destination matrix. +@param stream Stream for the asynchronous version. + +@sa transpose + */ +CV_EXPORTS_W void transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Flips a 2D matrix around vertical, horizontal, or both axes. + +@param src Source matrix. Supports 1, 3 and 4 channel images with CV_8U, CV_16U, CV_32S or +CV_32F depth. +@param dst Destination matrix. +@param flipCode Flip mode for the source: +- 0 Flips around x-axis. +- \> 0 Flips around y-axis. +- \< 0 Flips around both axes. +@param stream Stream for the asynchronous version. + +@sa flip + */ +CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null()); + +/** @brief Base class for transforms using a lookup table. + */ +class CV_EXPORTS_W LookUpTable : public Algorithm +{ +public: + /** @brief Transforms the source matrix into the destination matrix using the given look-up table: + dst(I) = lut(src(I)) . + + @param src Source matrix. CV_8UC1 and CV_8UC3 matrices are supported for now. + @param dst Destination matrix. + @param stream Stream for the asynchronous version. + */ + CV_WRAP virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0; +}; + +/** @brief Creates implementation for cuda::LookUpTable . + +@param lut Look-up table of 256 elements. It is a continuous CV_8U matrix. + */ +CV_EXPORTS_W Ptr<LookUpTable> createLookUpTable(InputArray lut); + +/** @brief Forms a border around an image. + +@param src Source image. CV_8UC1 , CV_8UC4 , CV_32SC1 , and CV_32FC1 types are supported. +@param dst Destination image with the same type as src. The size is +Size(src.cols+left+right, src.rows+top+bottom) . +@param top +@param bottom +@param left +@param right Number of pixels in each direction from the source image rectangle to extrapolate. +For example: top=1, bottom=1, left=1, right=1 mean that 1 pixel-wide border needs to be built. +@param borderType Border type. See borderInterpolate for details. BORDER_REFLECT101 , +BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supported for now. +@param value Border value. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, + Scalar value = Scalar(), Stream& stream = Stream::Null()); + +//! @} cudaarithm_core + +//! @addtogroup cudaarithm_reduce +//! @{ + +/** @brief Returns the norm of a matrix (or difference of two matrices). + +@param src1 Source matrix. Any matrices except 64F are supported. +@param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now. +@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. + +@sa norm + */ +CV_EXPORTS_W double norm(InputArray src1, int normType, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void calcNorm(InputArray src, OutputArray dst, int normType, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Returns the norm of the difference of two matrices.
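Usage of the LookUpTable interface above, sketched under the assumption of a CV_8UC1 input (the inversion table is just an example):

#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: 8-bit intensity inversion through cuda::LookUpTable.
// The table object is built once and can be reused for every frame.
void lutSketch(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst)  // src assumed CV_8UC1
{
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = static_cast<uchar>(255 - i);

    cv::Ptr<cv::cuda::LookUpTable> table = cv::cuda::createLookUpTable(lut);
    table->transform(src, dst);
}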
+ +@param src1 Source matrix. Any matrices except 64F are supported. +@param src2 Second source matrix (if any) with the same size and type as src1. +@param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now. + +@sa norm + */ +CV_EXPORTS_W double norm(InputArray src1, InputArray src2, int normType=NORM_L2); +/** @overload */ +CV_EXPORTS_W void calcNormDiff(InputArray src1, InputArray src2, OutputArray dst, int normType=NORM_L2, Stream& stream = Stream::Null()); + +/** @brief Returns the sum of matrix elements. + +@param src Source image of any depth except for CV_64F . +@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. + +@sa sum + */ +CV_EXPORTS_W Scalar sum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void calcSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Returns the sum of absolute values for matrix elements. + +@param src Source image of any depth except for CV_64F . +@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. + */ +CV_EXPORTS_W Scalar absSum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void calcAbsSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Returns the squared sum of matrix elements. + +@param src Source image of any depth except for CV_64F . +@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. + */ +CV_EXPORTS_W Scalar sqrSum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void calcSqrSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Finds global minimum and maximum matrix elements and returns their values. + +@param src Single-channel source image. +@param minVal Pointer to the returned minimum value. Use NULL if not required. +@param maxVal Pointer to the returned maximum value. Use NULL if not required. +@param mask Optional mask to select a sub-matrix. + +The function does not work with CV_64F images on GPUs with the compute capability \< 1.3. + +@sa minMaxLoc + */ +CV_EXPORTS_W void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void findMinMax(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Finds global minimum and maximum matrix elements and returns their values with locations. + +@param src Single-channel source image. +@param minVal Pointer to the returned minimum value. Use NULL if not required. +@param maxVal Pointer to the returned maximum value. Use NULL if not required. +@param minLoc Pointer to the returned minimum location. Use NULL if not required. +@param maxLoc Pointer to the returned maximum location. Use NULL if not required. +@param mask Optional mask to select a sub-matrix. + +The function does not work with CV_64F images on GPU with the compute capability \< 1.3. + +@sa minMaxLoc + */ +CV_EXPORTS_W void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, + InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS_W void findMinMaxLoc(InputArray src, OutputArray minMaxVals, OutputArray loc, + InputArray mask = noArray(), Stream& stream = Stream::Null()); + +/** @brief Counts non-zero matrix elements. + +@param src Single-channel source image. 
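To illustrate the difference between the blocking reductions and their calc*/find* counterparts documented above (the helper name and a single-channel input are assumptions):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: blocking minMax vs. the stream-friendly findMinMax variant.
void reductionSketch(const cv::cuda::GpuMat& src, cv::cuda::Stream& stream)
{
    // Blocking form: results are returned directly on the host.
    double lo = 0.0, hi = 0.0;
    cv::cuda::minMax(src, &lo, &hi);

    // Asynchronous form: the min/max pair stays in a small device matrix
    // until it is explicitly downloaded on the chosen stream.
    cv::cuda::GpuMat d_minmax;
    cv::cuda::findMinMax(src, d_minmax, cv::noArray(), stream);

    cv::Mat minmax;
    d_minmax.download(minmax, stream);
    stream.waitForCompletion();
}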
+ +The function does not work with CV_64F images on GPUs with the compute capability \< 1.3. + +@sa countNonZero + */ +CV_EXPORTS_W int countNonZero(InputArray src); +/** @overload */ +CV_EXPORTS_W void countNonZero(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Reduces a matrix to a vector. + +@param mtx Source 2D matrix. +@param vec Destination vector. Its size and type is defined by dim and dtype parameters. +@param dim Dimension index along which the matrix is reduced. 0 means that the matrix is reduced +to a single row. 1 means that the matrix is reduced to a single column. +@param reduceOp Reduction operation that could be one of the following: +- **CV_REDUCE_SUM** The output is the sum of all rows/columns of the matrix. +- **CV_REDUCE_AVG** The output is the mean vector of all rows/columns of the matrix. +- **CV_REDUCE_MAX** The output is the maximum (column/row-wise) of all rows/columns of the +matrix. +- **CV_REDUCE_MIN** The output is the minimum (column/row-wise) of all rows/columns of the +matrix. +@param dtype When it is negative, the destination vector will have the same type as the source +matrix. Otherwise, its type will be CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels()) . +@param stream Stream for the asynchronous version. + +The function reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of +1D vectors and performing the specified operation on the vectors until a single row/column is +obtained. For example, the function can be used to compute horizontal and vertical projections of a +raster image. In case of CV_REDUCE_SUM and CV_REDUCE_AVG , the output may have a larger element +bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction +modes. + +@sa reduce + */ +CV_EXPORTS_W void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null()); + +/** @brief Computes a mean value and a standard deviation of matrix elements. + +@param mtx Source matrix. CV_8UC1 matrices are supported for now. +@param mean Mean value. +@param stddev Standard deviation value. + +@sa meanStdDev + */ +CV_EXPORTS_W void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev); +/** @overload */ +CV_EXPORTS_W void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null()); + +/** @brief Computes a standard deviation of integral images. + +@param src Source image. Only the CV_32SC1 type is supported. +@param sqr Squared source image. Only the CV_32FC1 type is supported. +@param dst Destination image with the same type and size as src . +@param rect Rectangular window. +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null()); + +/** @brief Normalizes the norm or value range of an array. + +@param src Input array. +@param dst Output array of the same size as src . +@param alpha Norm value to normalize to or the lower range boundary in case of the range +normalization. +@param beta Upper range boundary in case of the range normalization; it is not used for the norm +normalization. +@param norm_type Normalization type ( NORM_MINMAX , NORM_L2 , NORM_L1 or NORM_INF ). +@param dtype When negative, the output array has the same type as src; otherwise, it has the same +number of channels as src and the depth =CV_MAT_DEPTH(dtype). +@param mask Optional operation mask. 
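A sketch of reduce combined with normalize, as documented above (a CV_32FC1 input is assumed; the helper is illustrative):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: column-wise means, then min-max scaling of the result into [0, 1].
void columnMeansSketch(const cv::cuda::GpuMat& mtx, cv::cuda::GpuMat& scaled)
{
    cv::cuda::GpuMat means;
    cv::cuda::reduce(mtx, means, 0 /* collapse to a single row */, cv::REDUCE_AVG, CV_32F);
    cv::cuda::normalize(means, scaled, 0.0, 1.0, cv::NORM_MINMAX, CV_32F);
}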
+@param stream Stream for the asynchronous version. + +@sa normalize + */ +CV_EXPORTS_W void normalize(InputArray src, OutputArray dst, double alpha, double beta, + int norm_type, int dtype, InputArray mask = noArray(), + Stream& stream = Stream::Null()); + +/** @brief Computes an integral image. + +@param src Source image. Only CV_8UC1 images are supported for now. +@param sum Integral image containing 32-bit unsigned integer values packed into CV_32SC1 . +@param stream Stream for the asynchronous version. + +@sa integral + */ +CV_EXPORTS_W void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null()); + +/** @brief Computes a squared integral image. + +@param src Source image. Only CV_8UC1 images are supported for now. +@param sqsum Squared integral image containing 64-bit unsigned integer values packed into +CV_64FC1 . +@param stream Stream for the asynchronous version. + */ +CV_EXPORTS_W void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null()); + +//! @} cudaarithm_reduce + +//! @addtogroup cudaarithm_arithm +//! @{ + +/** @brief Performs generalized matrix multiplication. + +@param src1 First multiplied input matrix that should have CV_32FC1 , CV_64FC1 , CV_32FC2 , or +CV_64FC2 type. +@param src2 Second multiplied input matrix of the same type as src1 . +@param alpha Weight of the matrix product. +@param src3 Third optional delta matrix added to the matrix product. It should have the same type +as src1 and src2 . +@param beta Weight of src3 . +@param dst Destination matrix. It has the proper size and the same type as input matrices. +@param flags Operation flags: +- **GEMM_1_T** transpose src1 +- **GEMM_2_T** transpose src2 +- **GEMM_3_T** transpose src3 +@param stream Stream for the asynchronous version. + +The function performs generalized matrix multiplication similar to the gemm functions in BLAS level +3. For example, gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T) corresponds to + +\f[\texttt{dst} = \texttt{alpha} \cdot \texttt{src1} ^T \cdot \texttt{src2} + \texttt{beta} \cdot \texttt{src3} ^T\f] + +@note Transposition operation doesn't support CV_64FC2 input type. + +@sa gemm + */ +CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, double alpha, + InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null()); + +/** @brief Performs a per-element multiplication of two Fourier spectrums. + +@param src1 First spectrum. +@param src2 Second spectrum with the same size and type as a . +@param dst Destination spectrum. +@param flags Mock parameter used for CPU/CUDA interfaces similarity. +@param conjB Optional flag to specify if the second spectrum needs to be conjugated before the +multiplication. +@param stream Stream for the asynchronous version. + +Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now. + +@sa mulSpectrums + */ +CV_EXPORTS_W void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null()); + +/** @brief Performs a per-element multiplication of two Fourier spectrums and scales the result. + +@param src1 First spectrum. +@param src2 Second spectrum with the same size and type as a . +@param dst Destination spectrum. +@param flags Mock parameter used for CPU/CUDA interfaces similarity, simply add a `0` value. +@param scale Scale constant. +@param conjB Optional flag to specify if the second spectrum needs to be conjugated before the +multiplication. 
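The gemm call documented above maps to BLAS-style notation as in this sketch; it assumes CV_32FC1 inputs with compatible shapes and a build with cuBLAS enabled (HAVE_CUBLAS):

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: dst = A^T * B + C on the GPU.
void gemmSketch(const cv::cuda::GpuMat& A, const cv::cuda::GpuMat& B,
                const cv::cuda::GpuMat& C, cv::cuda::GpuMat& dst)
{
    cv::cuda::gemm(A, B, 1.0, C, 1.0, dst, cv::GEMM_1_T);
}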
+@param stream Stream for the asynchronous version. + +Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now. + +@sa mulSpectrums + */ +CV_EXPORTS_W void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); + +/** @brief Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix. + +@param src Source matrix (real or complex). +@param dst Destination matrix (real or complex). +@param dft_size Size of a discrete Fourier transform. +@param flags Optional flags: +- **DFT_ROWS** transforms each individual row of the source matrix. +- **DFT_SCALE** scales the result: divide it by the number of elements in the transform +(obtained from dft_size ). +- **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real +cases are always forward and inverse, respectively). +- **DFT_COMPLEX_INPUT** Specifies that input is complex input with 2 channels. +- **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of +real-complex transform, so the destination matrix must be real. +@param stream Stream for the asynchronous version. + +Use to handle real matrices ( CV32FC1 ) and complex matrices in the interleaved format ( CV32FC2 ). + +The source matrix should be continuous, otherwise reallocation and data copying is performed. The +function chooses an operation mode depending on the flags, size, and channel count of the source +matrix: + +- If the source matrix is complex and the output is not specified as real, the destination +matrix is complex and has the dft_size size and CV_32FC2 type. The destination matrix +contains a full result of the DFT (forward or inverse). +- If the source matrix is complex and the output is specified as real, the function assumes that +its input is the result of the forward transform (see the next item). The destination matrix +has the dft_size size and CV_32FC1 type. It contains the result of the inverse DFT. +- If the source matrix is real (its type is CV_32FC1 ), forward DFT is performed. The result of +the DFT is packed into complex ( CV_32FC2 ) matrix. So, the width of the destination matrix +is dft_size.width / 2 + 1 . But if the source is a single column, the height is reduced +instead of the width. + +@sa dft + */ +CV_EXPORTS_W void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); + +/** @brief Base class for DFT operator as a cv::Algorithm. : + */ +class CV_EXPORTS_W DFT : public Algorithm +{ +public: + /** @brief Computes an FFT of a given image. + + @param image Source image. Only CV_32FC1 images are supported for now. + @param result Result image. + @param stream Stream for the asynchronous version. + */ + CV_WRAP virtual void compute(InputArray image, OutputArray result, Stream& stream = Stream::Null()) = 0; +}; + +/** @brief Creates implementation for cuda::DFT. + +@param dft_size The image size. +@param flags Optional flags: +- **DFT_ROWS** transforms each individual row of the source matrix. +- **DFT_SCALE** scales the result: divide it by the number of elements in the transform +(obtained from dft_size ). +- **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real +cases are always forward and inverse, respectively). +- **DFT_COMPLEX_INPUT** Specifies that inputs will be complex with 2 channels. +- **DFT_REAL_OUTPUT** specifies the output as real. 
The source matrix is the result of +real-complex transform, so the destination matrix must be real. + */ +CV_EXPORTS_W Ptr createDFT(Size dft_size, int flags); + +/** @brief Base class for convolution (or cross-correlation) operator. : + */ +class CV_EXPORTS_W Convolution : public Algorithm +{ +public: + /** @brief Computes a convolution (or cross-correlation) of two images. + + @param image Source image. Only CV_32FC1 images are supported for now. + @param templ Template image. The size is not greater than the image size. The type is the same as + image . + @param result Result image. If image is *W x H* and templ is *w x h*, then result must be *W-w+1 x + H-h+1*. + @param ccorr Flags to evaluate cross-correlation instead of convolution. + @param stream Stream for the asynchronous version. + */ + virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0; +}; + +/** @brief Creates implementation for cuda::Convolution . + +@param user_block_size Block size. If you leave default value Size(0,0) then automatic +estimation of block size will be used (which is optimized for speed). By varying user_block_size +you can reduce memory requirements at the cost of speed. + */ +CV_EXPORTS_W Ptr createConvolution(Size user_block_size = Size()); + +//! @} cudaarithm_arithm + +//! @} cudaarithm + +}} // namespace cv { namespace cuda { + +#endif /* OPENCV_CUDAARITHM_HPP */ diff --git a/modules/cudaarithm/perf/perf_arithm.cpp b/modules/cudaarithm/perf/perf_arithm.cpp new file mode 100644 index 00000000000..ca23e19dc14 --- /dev/null +++ b/modules/cudaarithm/perf/perf_arithm.cpp @@ -0,0 +1,254 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
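Before moving on to the performance tests, here is a usage sketch for the DFT and Convolution interfaces declared in cudaarithm.hpp above; it assumes CV_32FC1 inputs and a cuFFT-enabled build, and the helper name is illustrative only:

#include <opencv2/cudaarithm.hpp>

// Illustrative sketch: a reusable DFT operator plus frequency-domain cross-correlation.
void fftSketch(const cv::cuda::GpuMat& image,   // CV_32FC1
               const cv::cuda::GpuMat& templ,   // CV_32FC1, not larger than image
               cv::cuda::GpuMat& spectrum, cv::cuda::GpuMat& response)
{
    cv::Ptr<cv::cuda::DFT> dft = cv::cuda::createDFT(image.size(), 0);
    dft->compute(image, spectrum);              // packed CV_32FC2 spectrum of a real input

    cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution();
    conv->convolve(image, templ, response, /*ccorr=*/true);  // result is (W-w+1) x (H-h+1)
}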
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { namespace { + +////////////////////////////////////////////////////////////////////// +// GEMM + +#ifdef HAVE_CUBLAS + +CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T) +#define ALL_GEMM_FLAGS Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), \ + GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T)) + +DEF_PARAM_TEST(Sz_Type_Flags, cv::Size, MatType, GemmFlags); + +PERF_TEST_P(Sz_Type_Flags, GEMM, + Combine(Values(cv::Size(512, 512), cv::Size(1024, 1024)), + Values(CV_32FC1, CV_32FC2, CV_64FC1), + ALL_GEMM_FLAGS)) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + const int flags = GET_PARAM(2); + + cv::Mat src1(size, type); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, type); + declare.in(src2, WARMUP_RNG); + + cv::Mat src3(size, type); + declare.in(src3, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + declare.time(5.0); + + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + const cv::cuda::GpuMat d_src3(src3); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags); + + CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + declare.time(50.0); + + cv::Mat dst; + + TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); + + CPU_SANITY_CHECK(dst); + } +} + +#endif + +////////////////////////////////////////////////////////////////////// +// MulSpectrums + +CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT) + +DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags); + +PERF_TEST_P(Sz_Flags, MulSpectrums, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS)))) +{ + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat a(size, CV_32FC2); + cv::Mat b(size, CV_32FC2); + declare.in(a, b, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_a(a); + const cv::cuda::GpuMat d_b(b); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::mulSpectrums(d_a, d_b, dst, flag); + + CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulAndScaleSpectrums + +PERF_TEST_P(Sz, MulAndScaleSpectrums, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + const float scale = 1.f / size.area(); + + cv::Mat src1(size, CV_32FC2); + cv::Mat src2(size, CV_32FC2); + declare.in(src1,src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); + + CUDA_SANITY_CHECK(dst, 1e-6, 
ERROR_RELATIVE); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Dft + +PERF_TEST_P(Sz_Flags, Dft, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE)))) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::dft(d_src, dst, size, flag); + + CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::dft(src, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Convolve + +DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool); + +PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(17, 27, 32, 64), + Bool())) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int templ_size = GET_PARAM(1); + const bool ccorr = GET_PARAM(2); + + const cv::Mat image(size, CV_32FC1); + const cv::Mat templ(templ_size, templ_size, CV_32FC1); + declare.in(image, templ, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + cv::cuda::GpuMat d_image = cv::cuda::createContinuous(size, CV_32FC1); + d_image.upload(image); + + cv::cuda::GpuMat d_templ = cv::cuda::createContinuous(templ_size, templ_size, CV_32FC1); + d_templ.upload(templ); + + cv::Ptr convolution = cv::cuda::createConvolution(); + + cv::cuda::GpuMat dst; + + TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr); + + CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + if (ccorr) + FAIL_NO_CPU(); + + cv::Mat dst; + + TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ); + + CPU_SANITY_CHECK(dst); + } +} + +}} // namespace diff --git a/modules/cudaarithm/perf/perf_core.cpp b/modules/cudaarithm/perf/perf_core.cpp new file mode 100644 index 00000000000..bc9f0e2f715 --- /dev/null +++ b/modules/cudaarithm/perf/perf_core.cpp @@ -0,0 +1,323 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { namespace { + +#define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F) + +////////////////////////////////////////////////////////////////////// +// Merge + +DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn); + +PERF_TEST_P(Sz_Depth_Cn, Merge, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH, + Values(2, 3, 4))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + std::vector src(channels); + for (int i = 0; i < channels; ++i) + { + src[i].create(size, depth); + declare.in(src[i], WARMUP_RNG); + } + + if (PERF_RUN_CUDA()) + { + std::vector d_src(channels); + for (int i = 0; i < channels; ++i) + d_src[i].upload(src[i]); + + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::merge(d_src, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::merge(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Split + +PERF_TEST_P(Sz_Depth_Cn, Split, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH, + Values(2, 3, 4))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + cv::Mat src(size, CV_MAKE_TYPE(depth, channels)); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + std::vector dst; + + TEST_CYCLE() cv::cuda::split(d_src, dst); + + const cv::cuda::GpuMat& dst0 = dst[0]; + const cv::cuda::GpuMat& dst1 = dst[1]; + + CUDA_SANITY_CHECK(dst0, 1e-10); + CUDA_SANITY_CHECK(dst1, 1e-10); + } + else + { + std::vector dst; + + TEST_CYCLE() cv::split(src, dst); + + const cv::Mat& dst0 = dst[0]; + const cv::Mat& dst1 = dst[1]; + + CPU_SANITY_CHECK(dst0); + CPU_SANITY_CHECK(dst1); + } +} + +////////////////////////////////////////////////////////////////////// +// Transpose + +PERF_TEST_P(Sz_Type, Transpose, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_64FC1))) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::transpose(d_src, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::transpose(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Flip + +enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1}; +CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y) + +DEF_PARAM_TEST(Sz_Depth_Cn_Code, cv::Size, MatDepth, MatCn, 
FlipCode); + +PERF_TEST_P(Sz_Depth_Cn_Code, Flip, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4, + FlipCode::all())) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + const int flipCode = GET_PARAM(3); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::flip(d_src, dst, flipCode); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::flip(src, dst, flipCode); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// LutOneChannel + +PERF_TEST_P(Sz_Type, LutOneChannel, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8UC1, CV_8UC3))) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + cv::Mat lut(1, 256, CV_8UC1); + declare.in(lut, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + cv::Ptr lutAlg = cv::cuda::createLookUpTable(lut); + + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() lutAlg->transform(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::LUT(src, lut, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// LutMultiChannel + +PERF_TEST_P(Sz_Type, LutMultiChannel, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8UC3))) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + cv::Mat lut(1, 256, CV_MAKE_TYPE(CV_8U, src.channels())); + declare.in(lut, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + cv::Ptr lutAlg = cv::cuda::createLookUpTable(lut); + + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() lutAlg->transform(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::LUT(src, lut, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// CopyMakeBorder + +DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode); + +PERF_TEST_P(Sz_Depth_Cn_Border, CopyMakeBorder, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4, + ALL_BORDER_MODES)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + const int borderMode = GET_PARAM(3); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::copyMakeBorder(d_src, dst, 5, 5, 5, 5, borderMode); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderMode); + + CPU_SANITY_CHECK(dst); + } +} + +}} // namespace diff --git a/modules/cudaarithm/perf/perf_element_operations.cpp b/modules/cudaarithm/perf/perf_element_operations.cpp new file mode 100644 index 00000000000..02f412d9949 --- /dev/null +++ b/modules/cudaarithm/perf/perf_element_operations.cpp @@ -0,0 +1,1501 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { namespace { + +#define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F) + +////////////////////////////////////////////////////////////////////// +// AddMat + +DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth); + +PERF_TEST_P(Sz_Depth, AddMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::add(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::add(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// AddScalar + +PERF_TEST_P(Sz_Depth, AddScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::add(d_src, s, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::add(src, s, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// SubtractMat + +PERF_TEST_P(Sz_Depth, SubtractMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::subtract(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::subtract(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// SubtractScalar + +PERF_TEST_P(Sz_Depth, SubtractScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::subtract(d_src, s, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::subtract(src, s, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MultiplyMat + +PERF_TEST_P(Sz_Depth, MultiplyMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::multiply(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + 
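+            // TEST_CYCLE() executes the statement that follows repeatedly while the perf framework collects timing samples; the same CUDA-vs-CPU pattern is used by every test in this file.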
TEST_CYCLE() cv::multiply(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MultiplyScalar + +PERF_TEST_P(Sz_Depth, MultiplyScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::multiply(d_src, s, dst); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::multiply(src, s, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// DivideMat + +PERF_TEST_P(Sz_Depth, DivideMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::divide(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::divide(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// DivideScalar + +PERF_TEST_P(Sz_Depth, DivideScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::divide(d_src, s, dst); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::divide(src, s, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// DivideScalarInv + +PERF_TEST_P(Sz_Depth, DivideScalarInv, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::divide(s[0], d_src, dst); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::divide(s, src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// AbsDiffMat + +PERF_TEST_P(Sz_Depth, AbsDiffMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::absdiff(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::absdiff(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// 
AbsDiffScalar + +PERF_TEST_P(Sz_Depth, AbsDiffScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::absdiff(d_src, s, dst); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::absdiff(src, s, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Abs + +PERF_TEST_P(Sz_Depth, Abs, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_16S, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::abs(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Sqr + +PERF_TEST_P(Sz_Depth, Sqr, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16S, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::sqr(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Sqrt + +PERF_TEST_P(Sz_Depth, Sqrt, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16S, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + cv::randu(src, 0, 100000); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::sqrt(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::sqrt(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Log + +PERF_TEST_P(Sz_Depth, Log, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16S, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + cv::randu(src, 0, 100000); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::log(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::log(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Exp + +PERF_TEST_P(Sz_Depth, Exp, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16S, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + cv::randu(src, 0, 10); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::exp(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::exp(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Pow + +DEF_PARAM_TEST(Sz_Depth_Power, cv::Size, MatDepth, double); + +PERF_TEST_P(Sz_Depth_Power, Pow, + Combine(CUDA_TYPICAL_MAT_SIZES, + 
Values(CV_8U, CV_16S, CV_32F), + Values(0.3, 2.0, 2.4))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const double power = GET_PARAM(2); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::pow(d_src, power, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::pow(src, power, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// CompareMat + +CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE) + +DEF_PARAM_TEST(Sz_Depth_Code, cv::Size, MatDepth, CmpCode); + +PERF_TEST_P(Sz_Depth_Code, CompareMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH, + CmpCode::all())) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int cmp_code = GET_PARAM(2); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::compare(d_src1, d_src2, dst, cmp_code); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::compare(src1, src2, dst, cmp_code); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// CompareScalar + +PERF_TEST_P(Sz_Depth_Code, CompareScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + ARITHM_MAT_DEPTH, + CmpCode::all())) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int cmp_code = GET_PARAM(2); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::compare(d_src, s, dst, cmp_code); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::compare(src, s, dst, cmp_code); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseNot + +PERF_TEST_P(Sz_Depth, BitwiseNot, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_not(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_not(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseAndMat + +PERF_TEST_P(Sz_Depth, BitwiseAndMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_and(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_and(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + 
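+// The *Scalar bitwise tests that follow convert the randomly generated cv::Scalar to an integer-typed cv::Scalar_ before calling bitwise_and/or/xor, since only integer depths (CV_8U, CV_16U, CV_32S) are exercised.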
+////////////////////////////////////////////////////////////////////// +// BitwiseAndScalar + +DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn); + +PERF_TEST_P(Sz_Depth_Cn, BitwiseAndScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + cv::Scalar_ is = s; + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_and(d_src, is, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_and(src, is, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseOrMat + +PERF_TEST_P(Sz_Depth, BitwiseOrMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_or(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_or(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseOrScalar + +PERF_TEST_P(Sz_Depth_Cn, BitwiseOrScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + cv::Scalar_ is = s; + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_or(d_src, is, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_or(src, is, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseXorMat + +PERF_TEST_P(Sz_Depth, BitwiseXorMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_xor(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_xor(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// BitwiseXorScalar + +PERF_TEST_P(Sz_Depth_Cn, BitwiseXorScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat 
src(size, type); + declare.in(src, WARMUP_RNG); + + cv::Scalar s; + declare.in(s, WARMUP_RNG); + cv::Scalar_ is = s; + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::bitwise_xor(d_src, is, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::bitwise_xor(src, is, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// RShift + +PERF_TEST_P(Sz_Depth_Cn, RShift, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + const cv::Scalar_ val = cv::Scalar_::all(4); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::rshift(d_src, val, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// LShift + +PERF_TEST_P(Sz_Depth_Cn, LShift, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + const cv::Scalar_ val = cv::Scalar_::all(4); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::lshift(d_src, val, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// MinMat + +PERF_TEST_P(Sz_Depth, MinMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::min(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::min(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MinScalar + +PERF_TEST_P(Sz_Depth, MinScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar val; + declare.in(val, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::min(d_src, val[0], dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::min(src, val[0], dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MaxMat + +PERF_TEST_P(Sz_Depth, MaxMat, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src1(size, depth); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth); + declare.in(src2, WARMUP_RNG); + + if 
(PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::max(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::max(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MaxScalar + +PERF_TEST_P(Sz_Depth, MaxScalar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + cv::Scalar val; + declare.in(val, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::max(d_src, val[0], dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::max(src, val[0], dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// AddWeighted + +DEF_PARAM_TEST(Sz_3Depth, cv::Size, MatDepth, MatDepth, MatDepth); + +PERF_TEST_P(Sz_3Depth, AddWeighted, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F), + Values(CV_8U, CV_16U, CV_32F, CV_64F), + Values(CV_8U, CV_16U, CV_32F, CV_64F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth1 = GET_PARAM(1); + const int depth2 = GET_PARAM(2); + const int dst_depth = GET_PARAM(3); + + cv::Mat src1(size, depth1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, depth2); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::addWeighted(d_src1, 0.5, d_src2, 0.5, 10.0, dst, dst_depth); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::addWeighted(src1, 0.5, src2, 0.5, 10.0, dst, dst_depth); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MagnitudeComplex + +PERF_TEST_P(Sz, MagnitudeComplex, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::magnitude(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat xy[2]; + cv::split(src, xy); + + cv::Mat dst; + + TEST_CYCLE() cv::magnitude(xy[0], xy[1], dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MagnitudeSqrComplex + +PERF_TEST_P(Sz, MagnitudeSqrComplex, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::magnitudeSqr(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Magnitude + +PERF_TEST_P(Sz, Magnitude, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src1(size, CV_32FC1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, CV_32FC1); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::magnitude(d_src1, 
d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::magnitude(src1, src2, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MagnitudeSqr + +PERF_TEST_P(Sz, MagnitudeSqr, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src1(size, CV_32FC1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, CV_32FC1); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::magnitudeSqr(d_src1, d_src2, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Phase + +DEF_PARAM_TEST(Sz_AngleInDegrees, cv::Size, bool); + +PERF_TEST_P(Sz_AngleInDegrees, Phase, + Combine(CUDA_TYPICAL_MAT_SIZES, + Bool())) +{ + const cv::Size size = GET_PARAM(0); + const bool angleInDegrees = GET_PARAM(1); + + cv::Mat src1(size, CV_32FC1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, CV_32FC1); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::phase(d_src1, d_src2, dst, angleInDegrees); + + CUDA_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::phase(src1, src2, dst, angleInDegrees); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// CartToPolar + +PERF_TEST_P(Sz_AngleInDegrees, CartToPolar, + Combine(CUDA_TYPICAL_MAT_SIZES, + Bool())) +{ + const cv::Size size = GET_PARAM(0); + const bool angleInDegrees = GET_PARAM(1); + + cv::Mat src1(size, CV_32FC1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, CV_32FC1); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + cv::cuda::GpuMat magnitude; + cv::cuda::GpuMat angle; + + TEST_CYCLE() cv::cuda::cartToPolar(d_src1, d_src2, magnitude, angle, angleInDegrees); + + CUDA_SANITY_CHECK(magnitude); + CUDA_SANITY_CHECK(angle, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat magnitude; + cv::Mat angle; + + TEST_CYCLE() cv::cartToPolar(src1, src2, magnitude, angle, angleInDegrees); + + CPU_SANITY_CHECK(magnitude); + CPU_SANITY_CHECK(angle); + } +} + +////////////////////////////////////////////////////////////////////// +// PolarToCart + +PERF_TEST_P(Sz_AngleInDegrees, PolarToCart, + Combine(CUDA_TYPICAL_MAT_SIZES, + Bool())) +{ + const cv::Size size = GET_PARAM(0); + const bool angleInDegrees = GET_PARAM(1); + + cv::Mat magnitude(size, CV_32FC1); + declare.in(magnitude, WARMUP_RNG); + + cv::Mat angle(size, CV_32FC1); + declare.in(angle, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_magnitude(magnitude); + const cv::cuda::GpuMat d_angle(angle); + cv::cuda::GpuMat x; + cv::cuda::GpuMat y; + + TEST_CYCLE() cv::cuda::polarToCart(d_magnitude, d_angle, x, y, angleInDegrees); + + CUDA_SANITY_CHECK(x); + CUDA_SANITY_CHECK(y); + } + else + { + cv::Mat x; + cv::Mat y; + + TEST_CYCLE() cv::polarToCart(magnitude, angle, x, y, angleInDegrees); + + CPU_SANITY_CHECK(x); + CPU_SANITY_CHECK(y); + } +} + +////////////////////////////////////////////////////////////////////// +// Threshold + +CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, 
cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV) + +DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp); + +PERF_TEST_P(Sz_Depth_Op, Threshold, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F), + ThreshOp::all())) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int threshOp = GET_PARAM(2); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::threshold(d_src, dst, 100.0, 255.0, threshOp); + + CUDA_SANITY_CHECK(dst, 1e-10); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::threshold(src, dst, 100.0, 255.0, threshOp); + + CPU_SANITY_CHECK(dst); + } +} + +}} // namespace diff --git a/modules/cudaarithm/perf/perf_main.cpp b/modules/cudaarithm/perf/perf_main.cpp new file mode 100644 index 00000000000..118d7596ac2 --- /dev/null +++ b/modules/cudaarithm/perf/perf_main.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "perf_precomp.hpp" + +using namespace perf; + +CV_PERF_TEST_CUDA_MAIN(cudaarithm) diff --git a/modules/cudaarithm/perf/perf_precomp.hpp b/modules/cudaarithm/perf/perf_precomp.hpp new file mode 100644 index 00000000000..071ac946537 --- /dev/null +++ b/modules/cudaarithm/perf/perf_precomp.hpp @@ -0,0 +1,55 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +#ifndef __OPENCV_PERF_PRECOMP_HPP__ +#define __OPENCV_PERF_PRECOMP_HPP__ + +#include "opencv2/ts.hpp" +#include "opencv2/ts/cuda_perf.hpp" + +#include "opencv2/cudaarithm.hpp" + +namespace opencv_test { +using namespace perf; +using namespace testing; +} + +#endif diff --git a/modules/cudaarithm/perf/perf_reductions.cpp b/modules/cudaarithm/perf/perf_reductions.cpp new file mode 100644 index 00000000000..71bb5524a63 --- /dev/null +++ b/modules/cudaarithm/perf/perf_reductions.cpp @@ -0,0 +1,520 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" + +namespace opencv_test { namespace { + +////////////////////////////////////////////////////////////////////// +// Norm + +DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType); + +PERF_TEST_P(Sz_Depth_Norm, Norm, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32S, CV_32F), + Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2)))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int normType = GET_PARAM(2); + + cv::Mat src(size, depth); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat d_buf; + double gpu_dst; + + TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src, normType, d_buf); + + SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE); + } + else + { + double cpu_dst; + + TEST_CYCLE() cpu_dst = cv::norm(src, normType); + + SANITY_CHECK(cpu_dst, 1e-6, ERROR_RELATIVE); + } +} + +////////////////////////////////////////////////////////////////////// +// NormDiff + +DEF_PARAM_TEST(Sz_Norm, cv::Size, NormType); + +PERF_TEST_P(Sz_Norm, NormDiff, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2)))) +{ + const cv::Size size = GET_PARAM(0); + const int normType = GET_PARAM(1); + + cv::Mat src1(size, CV_8UC1); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, CV_8UC1); + declare.in(src2, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src1(src1); + const cv::cuda::GpuMat d_src2(src2); + double gpu_dst; + + TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, normType); + + SANITY_CHECK(gpu_dst); + + } + else + { + double cpu_dst; + + TEST_CYCLE() cpu_dst = cv::norm(src1, src2, normType); + + SANITY_CHECK(cpu_dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Sum + +DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn); + 
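+// Sum-style reductions: cv::cuda::sum is benchmarked against cv::sum on the CPU, while cv::cuda::absSum and cv::cuda::sqrSum have no direct CPU counterpart here and report FAIL_NO_CPU() instead.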
+PERF_TEST_P(Sz_Depth_Cn, Sum, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::Scalar gpu_dst; + + TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src); + + SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE); + } + else + { + cv::Scalar cpu_dst; + + TEST_CYCLE() cpu_dst = cv::sum(src); + + SANITY_CHECK(cpu_dst, 1e-6, ERROR_RELATIVE); + } +} + +////////////////////////////////////////////////////////////////////// +// SumAbs + +PERF_TEST_P(Sz_Depth_Cn, SumAbs, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::Scalar gpu_dst; + + TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src); + + SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// SumSqr + +PERF_TEST_P(Sz_Depth_Cn, SumSqr, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4)) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::Scalar gpu_dst; + + TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src); + + SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE); + } + else + { + FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// MinMax + +DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth); + +PERF_TEST_P(Sz_Depth, MinMax, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + double gpu_minVal, gpu_maxVal; + + TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat()); + + SANITY_CHECK(gpu_minVal, 1e-10); + SANITY_CHECK(gpu_maxVal, 1e-10); + } + else + { + double cpu_minVal, cpu_maxVal; + + TEST_CYCLE() cv::minMaxLoc(src, &cpu_minVal, &cpu_maxVal); + + SANITY_CHECK(cpu_minVal); + SANITY_CHECK(cpu_maxVal); + } +} + +////////////////////////////////////////////////////////////////////// +// MinMaxLoc + +PERF_TEST_P(Sz_Depth, MinMaxLoc, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + double gpu_minVal, gpu_maxVal; + cv::Point gpu_minLoc, gpu_maxLoc; + + TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc); + + SANITY_CHECK(gpu_minVal, 1e-10); + SANITY_CHECK(gpu_maxVal, 
1e-10); + } + else + { + double cpu_minVal, cpu_maxVal; + cv::Point cpu_minLoc, cpu_maxLoc; + + TEST_CYCLE() cv::minMaxLoc(src, &cpu_minVal, &cpu_maxVal, &cpu_minLoc, &cpu_maxLoc); + + SANITY_CHECK(cpu_minVal); + SANITY_CHECK(cpu_maxVal); + } +} + +////////////////////////////////////////////////////////////////////// +// CountNonZero + +PERF_TEST_P(Sz_Depth, CountNonZero, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F))) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + + cv::Mat src(size, depth); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + int gpu_dst = 0; + + TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src); + + SANITY_CHECK(gpu_dst); + } + else + { + int cpu_dst = 0; + + TEST_CYCLE() cpu_dst = cv::countNonZero(src); + + SANITY_CHECK(cpu_dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Reduce + +CV_ENUM(ReduceCode, REDUCE_SUM, REDUCE_AVG, REDUCE_MAX, REDUCE_MIN) + +enum {Rows = 0, Cols = 1}; +CV_ENUM(ReduceDim, Rows, Cols) + +DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim); + +PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_16S, CV_32F), + Values(1, 2, 3, 4), + ReduceCode::all(), + ReduceDim::all())) +{ + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + const int reduceOp = GET_PARAM(3); + const int dim = GET_PARAM(4); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F); + + dst = dst.reshape(dst.channels(), 1); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Normalize + +DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType); + +PERF_TEST_P(Sz_Depth_NormType, Normalize, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F, CV_64F), + Values(NormType(cv::NORM_INF), + NormType(cv::NORM_L1), + NormType(cv::NORM_L2), + NormType(cv::NORM_MINMAX)))) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + const int norm_type = GET_PARAM(2); + + const double alpha = 1; + const double beta = 0; + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat()); + + CUDA_SANITY_CHECK(dst, 1e-6); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::normalize(src, dst, alpha, beta, norm_type, type); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MeanStdDev + +PERF_TEST_P(Sz, MeanStdDev, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::Scalar gpu_mean; + cv::Scalar gpu_stddev; + + TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev); + + SANITY_CHECK(gpu_mean); + SANITY_CHECK(gpu_stddev); + } + else + { + cv::Scalar cpu_mean; + cv::Scalar cpu_stddev; + + 
TEST_CYCLE() cv::meanStdDev(src, cpu_mean, cpu_stddev); + + SANITY_CHECK(cpu_mean); + SANITY_CHECK(cpu_stddev); + } +} + +////////////////////////////////////////////////////////////////////// +// Integral + +PERF_TEST_P(Sz, Integral, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::integral(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::integral(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// IntegralSqr + +PERF_TEST_P(Sz, IntegralSqr, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} + +}} // namespace diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp new file mode 100644 index 00000000000..381580cff43 --- /dev/null +++ b/modules/cudaarithm/src/arithm.cpp @@ -0,0 +1,582 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::cuda; + +#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) + +void cv::cuda::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); } + +void cv::cuda::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); } +void cv::cuda::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); } + +void cv::cuda::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); } + +Ptr cv::cuda::createConvolution(Size) { throw_no_cuda(); return Ptr(); } + +#else /* !defined (HAVE_CUDA) */ + +namespace +{ + #define error_entry(entry) { entry, #entry } + + struct ErrorEntry + { + int code; + const char* str; + }; + + struct ErrorEntryComparer + { + int code; + ErrorEntryComparer(int code_) : code(code_) {} + bool operator()(const ErrorEntry& e) const { return e.code == code; } + }; + + String getErrorString(int code, const ErrorEntry* errors, size_t n) + { + size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors; + + const char* msg = (idx != n) ? errors[idx].str : "Unknown error code"; + String str = cv::format("%s [Code = %d]", msg, code); + + return str; + } +} + +#ifdef HAVE_CUBLAS + namespace + { + const ErrorEntry cublas_errors[] = + { + error_entry( CUBLAS_STATUS_SUCCESS ), + error_entry( CUBLAS_STATUS_NOT_INITIALIZED ), + error_entry( CUBLAS_STATUS_ALLOC_FAILED ), + error_entry( CUBLAS_STATUS_INVALID_VALUE ), + error_entry( CUBLAS_STATUS_ARCH_MISMATCH ), + error_entry( CUBLAS_STATUS_MAPPING_ERROR ), + error_entry( CUBLAS_STATUS_EXECUTION_FAILED ), + error_entry( CUBLAS_STATUS_INTERNAL_ERROR ) + }; + + const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]); + + static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func) + { + if (CUBLAS_STATUS_SUCCESS != err) + { + String msg = getErrorString(err, cublas_errors, cublas_error_num); + cv::error(cv::Error::GpuApiCallError, msg, func, file, line); + } + } + } + + #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, CV_Func) +#endif // HAVE_CUBLAS + +#ifdef HAVE_CUFFT + namespace + { + ////////////////////////////////////////////////////////////////////////// + // CUFFT errors + + const ErrorEntry cufft_errors[] = + { + error_entry( CUFFT_INVALID_PLAN ), + error_entry( CUFFT_ALLOC_FAILED ), + error_entry( CUFFT_INVALID_TYPE ), + error_entry( CUFFT_INVALID_VALUE ), + error_entry( CUFFT_INTERNAL_ERROR ), + error_entry( CUFFT_EXEC_FAILED ), + error_entry( CUFFT_SETUP_FAILED ), + error_entry( CUFFT_INVALID_SIZE ), + error_entry( CUFFT_UNALIGNED_DATA ) + }; + + const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]); + + void ___cufftSafeCall(int err, const char* file, const int line, const char* func) + { + if (CUFFT_SUCCESS != err) + { + String msg = getErrorString(err, cufft_errors, cufft_error_num); + cv::error(cv::Error::GpuApiCallError, msg, func, file, line); + } + } + } + + #define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, CV_Func) + +#endif + +//////////////////////////////////////////////////////////////////////// +// gemm + +void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream) +{ +#ifndef HAVE_CUBLAS + CV_UNUSED(_src1); + CV_UNUSED(_src2); + CV_UNUSED(alpha); + CV_UNUSED(_src3); + 
CV_UNUSED(beta); + CV_UNUSED(_dst); + CV_UNUSED(flags); + CV_UNUSED(stream); + CV_Error(Error::StsNotImplemented, "The library was build without CUBLAS"); +#else + // CUBLAS works with column-major matrices + + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); + GpuMat src3 = getInputMat(_src3, stream); + + CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 ); + CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) ); + + if (src1.depth() == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + bool tr1 = (flags & GEMM_1_T) != 0; + bool tr2 = (flags & GEMM_2_T) != 0; + bool tr3 = (flags & GEMM_3_T) != 0; + + if (src1.type() == CV_64FC2) + { + if (tr1 || tr2 || tr3) + CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type"); + } + + Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size(); + Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size(); + Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size(); + Size dstSize(src2Size.width, src1Size.height); + + CV_Assert( src1Size.width == src2Size.height ); + CV_Assert( src3.empty() || src3Size == dstSize ); + + GpuMat dst = getOutputMat(_dst, dstSize, src1.type(), stream); + + if (beta != 0) + { + if (src3.empty()) + { + dst.setTo(Scalar::all(0), stream); + } + else + { + if (tr3) + { + cuda::transpose(src3, dst, stream); + } + else + { + src3.copyTo(dst, stream); + } + } + } + + cublasHandle_t handle; + cublasSafeCall( cublasCreate_v2(&handle) ); + + cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) ); + + cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) ); + + const float alphaf = static_cast(alpha); + const float betaf = static_cast(beta); + + const cuComplex alphacf = make_cuComplex(alphaf, 0); + const cuComplex betacf = make_cuComplex(betaf, 0); + + const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0); + const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0); + + cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N; + + switch (src1.type()) + { + case CV_32FC1: + cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, + &alphaf, + src2.ptr(), static_cast(src2.step / sizeof(float)), + src1.ptr(), static_cast(src1.step / sizeof(float)), + &betaf, + dst.ptr(), static_cast(dst.step / sizeof(float))) ); + break; + + case CV_64FC1: + cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, + &alpha, + src2.ptr(), static_cast(src2.step / sizeof(double)), + src1.ptr(), static_cast(src1.step / sizeof(double)), + &beta, + dst.ptr(), static_cast(dst.step / sizeof(double))) ); + break; + + case CV_32FC2: + cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, + &alphacf, + src2.ptr(), static_cast(src2.step / sizeof(cuComplex)), + src1.ptr(), static_cast(src1.step / sizeof(cuComplex)), + &betacf, + dst.ptr(), static_cast(dst.step / sizeof(cuComplex))) ); + break; + + case CV_64FC2: + cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? 
src1.cols : src1.rows, tr2 ? src2.cols : src2.rows, + &alphac, + src2.ptr(), static_cast(src2.step / sizeof(cuDoubleComplex)), + src1.ptr(), static_cast(src1.step / sizeof(cuDoubleComplex)), + &betac, + dst.ptr(), static_cast(dst.step / sizeof(cuDoubleComplex))) ); + break; + } + + cublasSafeCall( cublasDestroy_v2(handle) ); + + syncOutput(dst, _dst, stream); +#endif +} + +////////////////////////////////////////////////////////////////////////////// +// DFT function + +void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream) +{ + if (getInputMat(_src, stream).channels() == 2) + flags |= DFT_COMPLEX_INPUT; + + Ptr dft = createDFT(dft_size, flags); + dft->compute(_src, _dst, stream); +} + +////////////////////////////////////////////////////////////////////////////// +// DFT algorithm + +#ifdef HAVE_CUFFT + +namespace +{ + + class DFTImpl : public DFT + { + Size dft_size, dft_size_opt; + bool is_1d_input, is_row_dft, is_scaled_dft, is_inverse, is_complex_input, is_complex_output; + + cufftType dft_type; + cufftHandle plan; + + public: + DFTImpl(Size dft_size, int flags) + : dft_size(dft_size), + dft_size_opt(dft_size), + is_1d_input((dft_size.height == 1) || (dft_size.width == 1)), + is_row_dft((flags & DFT_ROWS) != 0), + is_scaled_dft((flags & DFT_SCALE) != 0), + is_inverse((flags & DFT_INVERSE) != 0), + is_complex_input((flags & DFT_COMPLEX_INPUT) != 0), + is_complex_output(!(flags & DFT_REAL_OUTPUT)), + dft_type(!is_complex_input ? CUFFT_R2C : (is_complex_output ? CUFFT_C2C : CUFFT_C2R)) + { + // We don't support unpacked output (in the case of real input) + CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) ); + + // We don't support real-to-real transform + CV_Assert( is_complex_input || is_complex_output ); + + if (is_1d_input && !is_row_dft) + { + // If the source matrix is single column handle it as single row + dft_size_opt.width = std::max(dft_size.width, dft_size.height); + dft_size_opt.height = std::min(dft_size.width, dft_size.height); + } + + CV_Assert( dft_size_opt.width > 1 ); + + if (is_1d_input || is_row_dft) + cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) ); + else + cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) ); + } + + ~DFTImpl() + { + cufftSafeCall( cufftDestroy(plan) ); + } + + void compute(InputArray _src, OutputArray _dst, Stream& stream) + { + GpuMat src = getInputMat(_src, stream); + + CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 ); + CV_Assert( is_complex_input == (src.channels() == 2) ); + + // Make sure here we work with the continuous input, + // as CUFFT can't handle gaps + GpuMat src_cont; + if (src.isContinuous()) + { + src_cont = src; + } + else + { + BufferPool pool(stream); + src_cont.allocator = pool.getAllocator(); + createContinuous(src.rows, src.cols, src.type(), src_cont); + src.copyTo(src_cont, stream); + } + + cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); + + if (is_complex_input) + { + if (is_complex_output) + { + createContinuous(dft_size, CV_32FC2, _dst); + GpuMat dst = _dst.getGpuMat(); + + cufftSafeCall(cufftExecC2C( + plan, src_cont.ptr(), dst.ptr(), + is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); + } + else + { + createContinuous(dft_size, CV_32F, _dst); + GpuMat dst = _dst.getGpuMat(); + + cufftSafeCall(cufftExecC2R( + plan, src_cont.ptr(), dst.ptr())); + } + } + else + { + // We could swap dft_size for efficiency. 
Here we must reflect it + if (dft_size == dft_size_opt) + createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst); + else + createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst); + + GpuMat dst = _dst.getGpuMat(); + + cufftSafeCall(cufftExecR2C( + plan, src_cont.ptr(), dst.ptr())); + } + + if (is_scaled_dft) + cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream); + } + }; +} + +#endif + +Ptr cv::cuda::createDFT(Size dft_size, int flags) +{ +#ifndef HAVE_CUFFT + CV_UNUSED(dft_size); + CV_UNUSED(flags); + CV_Error(Error::StsNotImplemented, "The library was build without CUFFT"); + return Ptr(); +#else + return makePtr(dft_size, flags); +#endif +} + +////////////////////////////////////////////////////////////////////////////// +// Convolution + +#ifdef HAVE_CUFFT + +namespace +{ + class ConvolutionImpl : public Convolution + { + public: + explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {} + + void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()); + + private: + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size); + + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + + GpuMat image_spect, templ_spect, result_spect; + GpuMat image_block, templ_block, result_data; + }; + + void ConvolutionImpl::create(Size image_size, Size templ_size) + { + result_size = Size(image_size.width - templ_size.width + 1, + image_size.height - templ_size.height + 1); + + block_size = user_block_size; + if (user_block_size.width == 0 || user_block_size.height == 0) + block_size = estimateBlockSize(result_size); + + dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); + dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) 
/ std::log(2.))); + + // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), + // see CUDA Toolkit 4.1 CUFFT Library Programming Guide + if (dft_size.width > 8192) + dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); + if (dft_size.height > 8192) + dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); + + // To avoid wasting time doing small DFTs + dft_size.width = std::max(dft_size.width, 512); + dft_size.height = std::max(dft_size.height, 512); + + createContinuous(dft_size, CV_32F, image_block); + createContinuous(dft_size, CV_32F, templ_block); + createContinuous(dft_size, CV_32F, result_data); + + int spect_len = dft_size.height * (dft_size.width / 2 + 1); + createContinuous(1, spect_len, CV_32FC2, image_spect); + createContinuous(1, spect_len, CV_32FC2, templ_spect); + createContinuous(1, spect_len, CV_32FC2, result_spect); + + // Use maximum result matrix block size for the estimated DFT block size + block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); + block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); + } + + Size ConvolutionImpl::estimateBlockSize(Size result_size) + { + int width = (result_size.width + 2) / 3; + int height = (result_size.height + 2) / 3; + width = std::min(width, result_size.width); + height = std::min(height, result_size.height); + return Size(width, height); + } + + void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream) + { + GpuMat image = getInputMat(_image, _stream); + GpuMat templ = getInputMat(_templ, _stream); + + CV_Assert( image.type() == CV_32FC1 ); + CV_Assert( templ.type() == CV_32FC1 ); + + create(image.size(), templ.size()); + + GpuMat result = getOutputMat(_result, result_size, CV_32FC1, _stream); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + cufftHandle planR2C, planC2R; + cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) ); + cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) ); + + cufftSafeCall( cufftSetStream(planR2C, stream) ); + cufftSafeCall( cufftSetStream(planC2R, stream) ); + + GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step); + cuda::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0, Scalar(), _stream); + + cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr(), templ_spect.ptr()) ); + + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) + { + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr(y) + x), + image.step); + cuda::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, 0, Scalar(), _stream); + + cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr(), + image_spect.ptr())); + cuda::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0, + 1.f / dft_size.area(), ccorr, _stream); + cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr(), + result_data.ptr())); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + block_size.height, result.rows) - y); + GpuMat result_roi(result_roi_size, result.type(), + (void*)(result.ptr(y) + x), 
result.step); + GpuMat result_block(result_roi_size, result_data.type(), + result_data.ptr(), result_data.step); + + result_block.copyTo(result_roi, _stream); + } + } + + cufftSafeCall( cufftDestroy(planR2C) ); + cufftSafeCall( cufftDestroy(planC2R) ); + + syncOutput(result, _result, _stream); + } +} + +#endif + +Ptr cv::cuda::createConvolution(Size user_block_size) +{ +#ifndef HAVE_CUFFT + CV_UNUSED(user_block_size); + CV_Error(Error::StsNotImplemented, "The library was build without CUFFT"); + return Ptr(); +#else + return makePtr(user_block_size); +#endif +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp new file mode 100644 index 00000000000..7dd51f97816 --- /dev/null +++ b/modules/cudaarithm/src/core.cpp @@ -0,0 +1,135 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
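Note on the FFT-based Convolution implementation above (an illustrative usage sketch, not part of the patch): the result is tiled into blocks, each image block and the template are zero-padded to a shared power-of-two dft_size, the spectra are multiplied with mulAndScaleSpectrums, and each block is inverse-transformed back into its place in the result. Assuming CV_32FC1 inputs, as the asserts above require, a minimal caller looks roughly like:

    #include <opencv2/cudaarithm.hpp>

    cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution();
    cv::cuda::GpuMat image, templ, result;                 // CV_32FC1, filled elsewhere
    conv->convolve(image, templ, result, /*ccorr=*/true);  // cross-correlation instead of true convolution
    // result has size (image.cols - templ.cols + 1) x (image.rows - templ.rows + 1)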
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::cuda; + +#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) + +void cv::cuda::merge(const GpuMat*, size_t, OutputArray, Stream&) { throw_no_cuda(); } +void cv::cuda::merge(const std::vector&, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::cuda::split(InputArray, GpuMat*, Stream&) { throw_no_cuda(); } +void cv::cuda::split(InputArray, std::vector&, Stream&) { throw_no_cuda(); } + +void cv::cuda::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::cuda::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } + +Ptr cv::cuda::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr(); } + +void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int, Scalar, Stream&) { throw_no_cuda(); } + +#else /* !defined (HAVE_CUDA) */ + +//////////////////////////////////////////////////////////////////////// +// flip + +namespace +{ + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; }; + + template struct NppMirrorFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip); + }; + + template ::func_t func> struct NppMirror + { + typedef typename NppMirrorFunc::npp_t npp_t; + + static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, + (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? 
NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream); + static const func_t funcs[6][4] = + { + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {0,0,0,0}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call}, + {NppMirror::call, 0, NppMirror::call, NppMirror::call} + }; + + GpuMat src = getInputMat(_src, stream); + + CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F); + CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + + _dst.create(src.size(), src.type()); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); + + funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudaarithm/src/cuda/absdiff_mat.cu b/modules/cudaarithm/src/cuda/absdiff_mat.cu new file mode 100644 index 00000000000..ec04f122845 --- /dev/null +++ b/modules/cudaarithm/src/cuda/absdiff_mat.cu @@ -0,0 +1,188 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
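Note on the flip wrapper above (illustrative only, not part of the patch): flipCode keeps the usual OpenCV convention and is translated to an NPP mirror axis right before the nppiMirror call:

    #include <opencv2/cudaarithm.hpp>

    cv::cuda::GpuMat src, dst;        // e.g. CV_8UC3
    cv::cuda::flip(src, dst,  0);     // around the x-axis  -> NPP_HORIZONTAL_AXIS
    cv::cuda::flip(src, dst,  1);     // around the y-axis  -> NPP_VERTICAL_AXIS
    cv::cuda::flip(src, dst, -1);     // around both axes   -> NPP_BOTH_AXIS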
+// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int); + +namespace +{ + __device__ __forceinline__ int _abs(int a) + { + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) + { + return ::fabs(a); + } + + template struct AbsDiffOp1 : binary_function + { + __device__ __forceinline__ T operator ()(T a, T b) const + { + return saturate_cast(_abs(a - b)); + } + }; + + template struct TransformPolicy : DefaultTransformPolicy + { + }; + template <> struct TransformPolicy : DefaultTransformPolicy + { + enum { + shift = 1 + }; + }; + + template + void absDiffMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), AbsDiffOp1(), stream); + } + + struct AbsDiffOp2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vabsdiff2(a, b); + } + }; + + void absDiffMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 1; + + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); + + gridTransformBinary(src1_, src2_, dst_, AbsDiffOp2(), stream); + } + + struct AbsDiffOp4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vabsdiff4(a, b); + } + }; + + void absDiffMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 2; + + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); + + gridTransformBinary(src1_, src2_, dst_, AbsDiffOp4(), stream); + } +} + +void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream); + static const func_t funcs[] = + { + absDiffMat_v1, + absDiffMat_v1, + absDiffMat_v1, + absDiffMat_v1, + absDiffMat_v1, + absDiffMat_v1, + absDiffMat_v1 + }; + + const int depth = src1.depth(); + + CV_DbgAssert( depth <= CV_64F ); + + GpuMat src1_ = src1.reshape(1); + GpuMat src2_ = src2.reshape(1); + GpuMat dst_ = dst.reshape(1); + + if (depth == CV_8U || depth == CV_16U) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + absDiffMat_v4(src1_, src2_, dst_, stream); + return; + } + else if (depth == CV_16U && (src1_.cols & 1) == 0) + { + absDiffMat_v2(src1_, src2_, dst_, stream); + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + 
func(src1_, src2_, dst_, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/absdiff_scalar.cu b/modules/cudaarithm/src/cuda/absdiff_scalar.cu new file mode 100644 index 00000000000..0955e40c8b1 --- /dev/null +++ b/modules/cudaarithm/src/cuda/absdiff_scalar.cu @@ -0,0 +1,133 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
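Note on the absDiffMat dispatcher above (a sketch of the fast path, not part of the patch): when src1, src2 and dst are all 32-byte aligned, CV_8U data with a column count divisible by 4 takes the vabsdiff4 route and CV_16U data with an even column count takes vabsdiff2, so several elements are processed per 32-bit word. Conceptually the 8-bit path does the following; the patch itself uses the cv::cudev wrapper, which is assumed here to map onto CUDA's per-byte SIMD intrinsic:

    __device__ unsigned int absdiff4_sketch(unsigned int a, unsigned int b)
    {
        return __vabsdiff4(a, b);   // |a - b| computed independently on each of the four packed bytes
    }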
+// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int); + +namespace +{ + template struct AbsDiffScalarOp : unary_function + { + ScalarType val; + + __device__ __forceinline__ DstType operator ()(SrcType a) const + { + abs_func f; + return saturate_cast(f(saturate_cast(a) - val)); + } + }; + + template struct TransformPolicy : DefaultTransformPolicy + { + }; + template <> struct TransformPolicy : DefaultTransformPolicy + { + enum { + shift = 1 + }; + }; + + template + void absDiffScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream) + { + typedef typename MakeVec::cn>::type ScalarType; + + cv::Scalar_ value_ = value; + + AbsDiffScalarOp op; + op.val = VecTraits::make(value_.val); + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); + } +} + +void absDiffScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, Stream& stream); + static const func_t funcs[7][4] = + { + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + }, + { + absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl, absDiffScalarImpl + } + }; + + const int sdepth = src.depth(); + const int cn = src.channels(); + + CV_DbgAssert( sdepth <= CV_64F && cn <= 4 && src.type() == dst.type()); + + const func_t func = funcs[sdepth][cn - 1]; + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val, dst, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/add_mat.cu b/modules/cudaarithm/src/cuda/add_mat.cu new file mode 100644 index 00000000000..4166cc104e0 --- /dev/null +++ b/modules/cudaarithm/src/cuda/add_mat.cu @@ -0,0 +1,225 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
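Note on AbsDiffScalarOp above (host-side sketch with a hypothetical helper name, not part of the patch): each element is widened to the float/double ScalarType, the scalar is subtracted, and abs_func plus saturate_cast bring the value back to the source depth. For an 8-bit, single-channel input this amounts to:

    #include <opencv2/core.hpp>
    #include <cmath>

    // dst(x) = saturate_cast<uchar>( | float(src(x)) - s | )
    static uchar absDiffScalarSketch(uchar v, float s)
    {
        return cv::saturate_cast<uchar>(std::fabs(static_cast<float>(v) - s));
    }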
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int); + +namespace +{ + template struct AddOp1 : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a + b); + } + }; + + template + void addMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) + { + if (mask.data) + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), AddOp1(), globPtr(mask), stream); + else + gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), AddOp1(), stream); + } + + struct AddOp2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vadd2(a, b); + } + }; + + void addMat_v2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 1; + + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); + + gridTransformBinary(src1_, src2_, dst_, AddOp2(), stream); + } + + struct AddOp4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + return vadd4(a, b); + } + }; + + void addMat_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) + { + const int vcols = src1.cols >> 2; + + GlobPtrSz src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols); + GlobPtrSz src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols); + GlobPtrSz dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols); + + gridTransformBinary(src1_, src2_, dst_, AddOp4(), stream); + } +} + +void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream); + static const func_t funcs[7][7] = + { + { + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1 + }, + { + addMat_v1, + addMat_v1, + addMat_v1, 
+ addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1 + }, + { + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1 + }, + { + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1, + addMat_v1 + }, + { + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + addMat_v1, + addMat_v1, + addMat_v1 + }, + { + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + addMat_v1, + addMat_v1 + }, + { + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + 0 /*addMat_v1*/, + addMat_v1 + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + + CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F ); + + GpuMat src1_ = src1.reshape(1); + GpuMat src2_ = src2.reshape(1); + GpuMat dst_ = dst.reshape(1); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + addMat_v4(src1_, src2_, dst_, stream); + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + addMat_v2(src1_, src2_, dst_, stream); + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/add_scalar.cu b/modules/cudaarithm/src/cuda/add_scalar.cu new file mode 100644 index 00000000000..92838a2a57d --- /dev/null +++ b/modules/cudaarithm/src/cuda/add_scalar.cu @@ -0,0 +1,180 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
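Note on the addMat dispatcher above (sketch, not part of the patch): the table is indexed as funcs[src depth][dst depth], with narrowing combinations left as null pointers, and the vadd2/vadd4 fast paths are taken only when there is no mask, the source and destination depths match, and all three buffers pass the 32-byte alignment test. That test is equivalent to:

    #include <cstdint>

    static bool allAligned32(const void* a, const void* b, const void* c)
    {
        const std::uintptr_t mask = 31;   // low 5 address bits must be zero -> 32-byte alignment
        return ((std::uintptr_t)a & mask) == 0 &&
               ((std::uintptr_t)b & mask) == 0 &&
               ((std::uintptr_t)c & mask) == 0;
    }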
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int); + +namespace +{ + template struct AddScalarOp : unary_function + { + ScalarType val; + + __device__ __forceinline__ DstType operator ()(SrcType a) const + { + return saturate_cast(saturate_cast(a) + val); + } + }; + + template struct TransformPolicy : DefaultTransformPolicy + { + }; + template <> struct TransformPolicy : DefaultTransformPolicy + { + enum { + shift = 1 + }; + }; + + template + void addScalarImpl(const GpuMat& src, cv::Scalar value, GpuMat& dst, const GpuMat& mask, Stream& stream) + { + typedef typename MakeVec::cn>::type ScalarType; + + cv::Scalar_ value_ = value; + + AddScalarOp op; + op.val = VecTraits::make(value_.val); + + if (mask.data) + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, globPtr(mask), stream); + else + gridTransformUnary_< TransformPolicy >(globPtr(src), globPtr(dst), op, stream); + } +} + +void addScalar(const GpuMat& src, cv::Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int) +{ + typedef void (*func_t)(const GpuMat& src, cv::Scalar val, GpuMat& dst, const GpuMat& mask, Stream& stream); + static const func_t funcs[7][7][4] = + { + { + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 
/*addScalarImpl*/, 0 /*addScalarImpl*/}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + }, + { + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/, 0 /*addScalarImpl*/}, + {addScalarImpl, addScalarImpl, addScalarImpl, addScalarImpl} + } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 ); + + const func_t func = funcs[sdepth][ddepth][cn - 1]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val, dst, mask, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/add_weighted.cu b/modules/cudaarithm/src/cuda/add_weighted.cu new file mode 100644 index 00000000000..929301076d3 --- /dev/null +++ b/modules/cudaarithm/src/cuda/add_weighted.cu @@ -0,0 +1,596 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
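Note on the addScalar dispatcher above (usage sketch, not part of the patch): the table is indexed as funcs[src depth][dst depth][channels - 1], so widening output depths are available while narrowing ones stay null. A call that exercises the CV_8U to CV_16U entry would look roughly like:

    #include <opencv2/cudaarithm.hpp>

    cv::cuda::GpuMat src;   // CV_8UC3, filled elsewhere
    cv::cuda::GpuMat dst;
    // per-channel scalar added with a wider output depth to avoid saturation
    cv::cuda::add(src, cv::Scalar(10, 20, 30), dst, cv::noArray(), CV_16U);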
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" + +using namespace cv; +using namespace cv::cuda; +using namespace cv::cudev; + +namespace +{ + template struct AddWeightedOp : binary_function + { + S alpha; + S beta; + S gamma; + + __device__ __forceinline__ D operator ()(T1 a, T2 b) const + { + return cudev::saturate_cast(a * alpha + b * beta + gamma); + } + }; + + template struct TransformPolicy : DefaultTransformPolicy + { + }; + template <> struct TransformPolicy : DefaultTransformPolicy + { + enum { + shift = 1 + }; + }; + + template + void addWeightedImpl(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream) + { + typedef typename LargerType::type larger_type1; + typedef typename LargerType::type larger_type2; + typedef typename LargerType::type scalar_type; + + AddWeightedOp op; + op.alpha = static_cast(alpha); + op.beta = static_cast(beta); + op.gamma = static_cast(gamma); + + gridTransformBinary_< TransformPolicy >(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); + } +} + +void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, Stream& stream); + static const func_t funcs[7][7][7] = + { + { + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + 
addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, 
+ { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + }, + { + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + }, + { + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/, + 0/*addWeightedImpl*/ + }, + { + addWeightedImpl, + addWeightedImpl, + 
addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl, + addWeightedImpl + } + } + }; + + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); + + int sdepth1 = src1.depth(); + int sdepth2 = src2.depth(); + + ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2); + const int cn = src1.channels(); + + CV_Assert( src2.size() == src1.size() && src2.channels() == cn ); + CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); + + GpuMat dst = getOutputMat(_dst, src1.size(), CV_MAKE_TYPE(ddepth, cn), stream); + + GpuMat src1_single = src1.reshape(1); + GpuMat src2_single = src2.reshape(1); + GpuMat dst_single = dst.reshape(1); + + if (sdepth1 > sdepth2) + { + src1_single.swap(src2_single); + std::swap(alpha, beta); + std::swap(sdepth1, sdepth2); + } + + const func_t func = funcs[sdepth1][sdepth2][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_single, alpha, src2_single, beta, gamma, dst_single, stream); + + syncOutput(dst, _dst, stream); +} + +#endif diff --git a/modules/cudaarithm/src/cuda/bitwise_mat.cu b/modules/cudaarithm/src/cuda/bitwise_mat.cu new file mode 100644 index 00000000000..f151c1a4862 --- /dev/null +++ b/modules/cudaarithm/src/cuda/bitwise_mat.cu @@ -0,0 +1,230 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
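Note on cv::cuda::addWeighted above (usage sketch, not part of the patch): each element becomes saturate_cast<D>(src1*alpha + src2*beta + gamma), and when src1 has the deeper type the operands (together with alpha and beta) are swapped so that only the sdepth1 <= sdepth2 half of the dispatch table needs to be instantiated. A typical blending call:

    #include <opencv2/cudaarithm.hpp>

    cv::cuda::GpuMat a, b, blended;   // a, b: CV_8UC1, same size
    cv::cuda::addWeighted(a, 0.7, b, 0.3, 0.0, blended, CV_32F);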
+// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" + +using namespace cv; +using namespace cv::cuda; +using namespace cv::cudev; + +void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op); + +////////////////////////////////////////////////////////////////////////////// +/// bitwise_not + +void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) +{ + GpuMat src = getInputMat(_src, stream); + GpuMat mask = getInputMat(_mask, stream); + + const int depth = src.depth(); + + CV_DbgAssert( depth <= CV_32F ); + CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); + + if (mask.empty()) + { + const int bcols = (int) (src.cols * src.elemSize()); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + GlobPtrSz vsrc = globPtr((uint*) src.data, src.step, src.rows, vcols); + GlobPtrSz vdst = globPtr((uint*) dst.data, dst.step, src.rows, vcols); + + gridTransformUnary(vsrc, vdst, bit_not(), stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + GlobPtrSz vsrc = globPtr((ushort*) src.data, src.step, src.rows, vcols); + GlobPtrSz vdst = globPtr((ushort*) dst.data, dst.step, src.rows, vcols); + + gridTransformUnary(vsrc, vdst, bit_not(), stream); + } + else + { + GlobPtrSz vsrc = globPtr((uchar*) src.data, src.step, src.rows, bcols); + GlobPtrSz vdst = globPtr((uchar*) dst.data, dst.step, src.rows, bcols); + + gridTransformUnary(vsrc, vdst, bit_not(), stream); + } + } + else + { + if (depth == CV_32F || depth == CV_32S) + { + GlobPtrSz vsrc = globPtr((uint*) src.data, src.step, src.rows, src.cols * src.channels()); + GlobPtrSz vdst = globPtr((uint*) dst.data, dst.step, src.rows, src.cols * src.channels()); + + gridTransformUnary(vsrc, vdst, bit_not(), singleMaskChannels(globPtr(mask), src.channels()), stream); + } + else if (depth == CV_16S || depth == CV_16U) + { + GlobPtrSz vsrc = globPtr((ushort*) src.data, src.step, src.rows, src.cols * src.channels()); + GlobPtrSz vdst = globPtr((ushort*) dst.data, dst.step, src.rows, src.cols * src.channels()); + + gridTransformUnary(vsrc, vdst, bit_not(), singleMaskChannels(globPtr(mask), src.channels()), stream); + } + else + { + GlobPtrSz vsrc = globPtr((uchar*) src.data, src.step, src.rows, src.cols * src.channels()); + GlobPtrSz vdst = globPtr((uchar*) dst.data, dst.step, src.rows, src.cols * src.channels()); + + gridTransformUnary(vsrc, vdst, bit_not(), singleMaskChannels(globPtr(mask), src.channels()), stream); + } + } + + syncOutput(dst, _dst, stream); +} + +////////////////////////////////////////////////////////////////////////////// +/// Binary bitwise logical operations + +namespace +{ + template